diff --git a/.gitignore b/.gitignore index dc0a38edcb563589ce3845803174598ca68ec396..be97cf2f3ff9878774913ecf8dab0130179bbf16 100644 --- a/.gitignore +++ b/.gitignore @@ -116,4 +116,10 @@ metal/paddle-mobile-demo/paddle-mobile-demo/Resources/images metal/paddle-mobile-demo/paddle-mobile-demo/Resources/models metal/MobileNetDemo/MobileNetDemo/Resources +#flatbuffers +lite/model_parser/flatbuffers/framework_generated.h + build* + +# hiai libs +ai_ddk_lib* diff --git a/.gitmodules b/.gitmodules index 107036c70292cf33e945f45a8bac935dea554ece..37af6a724560144190539ab677c8f17524f5e645 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,6 @@ [submodule "third-party/protobuf-host"] path = third-party/protobuf-host url = https://github.com/protocolbuffers/protobuf.git +[submodule "third-party/flatbuffers"] + path = third-party/flatbuffers + url = https://github.com/google/flatbuffers.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 8ac227f0154feb64178d9a99b6784bfd6db40d50..55375994031850d93caa89ec7050a9e8e657d04f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -106,7 +106,8 @@ lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kerne lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF) # cv build options lite_option(LITE_WITH_CV "Enable build cv image in lite" OFF) -lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." ON) +lite_option(LITE_WITH_STATIC_CUDA "Statically link cuda libraries." OFF) +lite_option(CUDA_WITH_FP16 "Compile with cuda half support" OFF) lite_option(LITE_WITH_ARM_CLANG "when arm lang is clang, its ON." OFF) # TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter. @@ -168,6 +169,7 @@ if(LITE_WITH_RKNPU) include(device/rknpu) endif() +include(external/flatbuffers) # for mobile if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake index 4fc59ccd62671c5862a298832b1ec03d4e96d05a..e6193e0bb3c93292d2264501fc4d5739ff8766ee 100644 --- a/cmake/cross_compiling/android.cmake +++ b/cmake/cross_compiling/android.cmake @@ -35,7 +35,11 @@ endif() if(NOT DEFINED ANDROID_API_LEVEL) set(ANDROID_API_LEVEL "23") if(ARM_TARGET_ARCH_ABI STREQUAL "armv7") - set(ANDROID_API_LEVEL "22") + if(LITE_WITH_NPU AND NOT LITE_ON_TINY_PUBLISH) + set(ANDROID_API_LEVEL "24") # HIAI DDK depends on android-24 + else() + set(ANDROID_API_LEVEL "22") + endif() endif() endif() diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index e7df3f0fd6f0b0efcaf9cd859df5fb84a0cadfc4..eb8e26218ad1d8adc920b1834abd9ba10669a3e9 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -2,6 +2,10 @@ if(NOT LITE_WITH_CUDA) return() endif() +if(WITH_CUDA_FP16) + add_definitions("-DCUDA_WITH_FP16") +endif() + set(paddle_known_gpu_archs "30 35 50 52 60 61 70") set(paddle_known_gpu_archs7 "30 35 50 52") set(paddle_known_gpu_archs8 "30 35 50 52 53 60 61 62") @@ -167,6 +171,10 @@ elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x add_definitions("-DPADDLE_CUDA_BINVER=\"100\"") endif() +if (CUDA_WITH_FP16) + STRING(REGEX REPLACE "30|35|50|52" "" paddle_known_gpu_archs ${paddle_known_gpu_archs}) +endif() + include_directories(${CUDA_INCLUDE_DIRS}) if(NOT WITH_DSO) if(WIN32) diff --git a/cmake/device/xpu.cmake b/cmake/device/xpu.cmake index 823048552f3cb5f05375e97e94cd5b5ad63e7563..16fc7dcf4191a6b2a145d4d6e70e915fe5321a6b 100644 --- a/cmake/device/xpu.cmake +++ b/cmake/device/xpu.cmake @@ -39,7 +39,7 @@ else() endif() find_library(XPU_SDK_XPU_RT_FILE NAMES xpurt - PATHS 
${XPU_SDK_ROOT}/XTDK/shlib + PATHS ${XPU_SDK_ROOT}/XTDK/runtime/shlib ${XPU_SDK_ROOT}/XTDK/shlib # libxpurt.so may have been moved to XTDK/runtime/shlib NO_DEFAULT_PATH) if(NOT XPU_SDK_XPU_RT_FILE) diff --git a/cmake/external/flatbuffers.cmake b/cmake/external/flatbuffers.cmake new file mode 100644 index 0000000000000000000000000000000000000000..12c6b162f686f0c08f1c90610767b3508130d0da --- /dev/null +++ b/cmake/external/flatbuffers.cmake @@ -0,0 +1,116 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +# Introduce variables: +# * CMAKE_INSTALL_LIBDIR +INCLUDE(GNUInstallDirs) +SET(LIBDIR "lib") +if(CMAKE_INSTALL_LIBDIR MATCHES ".*lib64$") + SET(LIBDIR "lib64") +endif() + +SET(FLATBUFFERS_PREFIX_DIR ${THIRD_PARTY_PATH}/flatbuffers) +SET(FLATBUFFERS_SOURCES_DIR ${CMAKE_SOURCE_DIR}/third-party/flatbuffers) +SET(FLATBUFFERS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/flatbuffers) +SET(FLATBUFFERS_INCLUDE_DIR "${FLATBUFFERS_SOURCES_DIR}/include" CACHE PATH "flatbuffers include directory." FORCE) +IF(WIN32) + set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE) +ELSE(WIN32) + set(FLATBUFFERS_LIBRARIES "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.a" CACHE FILEPATH "FLATBUFFERS_LIBRARIES" FORCE) +ENDIF(WIN32) + +INCLUDE_DIRECTORIES(${FLATBUFFERS_INCLUDE_DIR}) + +if(NOT HOST_CXX_COMPILER) + set(HOST_CXX_COMPILER ${CMAKE_CXX_COMPILER}) + set(HOST_C_COMPILER ${CMAKE_C_COMPILER}) +endif() + +SET(OPTIONAL_ARGS "-DCMAKE_CXX_COMPILER=${HOST_CXX_COMPILER}" + "-DCMAKE_C_COMPILER=${HOST_C_COMPILER}") + +ExternalProject_Add( + extern_flatbuffers + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/google/flatbuffers.git" + GIT_TAG "v1.12.0" + SOURCE_DIR ${FLATBUFFERS_SOURCES_DIR} + PREFIX ${FLATBUFFERS_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DBUILD_STATIC_LIBS=ON + -DCMAKE_INSTALL_PREFIX=${FLATBUFFERS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_TESTING=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR} + -DFLATBUFFERS_BUILD_TESTS=OFF + ${CROSS_COMPILE_CMAKE_ARGS} + ${OPTIONAL_ARGS} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${FLATBUFFERS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} +) +IF(WIN32) + IF(NOT EXISTS "${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib") + add_custom_command(TARGET extern_flatbuffers POST_BUILD + COMMAND cmake -E copy ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/flatbuffers_static.lib ${FLATBUFFERS_INSTALL_DIR}/${LIBDIR}/libflatbuffers.lib + ) + ENDIF() +ENDIF(WIN32) +ADD_LIBRARY(flatbuffers STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET flatbuffers PROPERTY IMPORTED_LOCATION ${FLATBUFFERS_LIBRARIES}) +ADD_DEPENDENCIES(flatbuffers extern_flatbuffers) + +SET(FLATBUFFERS_FLATC_EXECUTABLE ${FLATBUFFERS_INSTALL_DIR}/bin/flatc) + 
+function(register_generated_output file_name) + get_property(tmp GLOBAL PROPERTY FBS_GENERATED_OUTPUTS) + list(APPEND tmp ${file_name}) + set_property(GLOBAL PROPERTY FBS_GENERATED_OUTPUTS ${tmp}) +endfunction(register_generated_output) + +function(compile_flatbuffers_schema_to_cpp_opt TARGET SRC_FBS OPT) + if(FLATBUFFERS_BUILD_LEGACY) + set(OPT ${OPT};--cpp-std c++0x) + else() + # --cpp-std is defined by flatc default settings. + endif() + message(STATUS "`${SRC_FBS}`: add generation of C++ code with '${OPT}'") + get_filename_component(SRC_FBS_DIR ${SRC_FBS} PATH) + message(STATUS "SRC_FBS_DIR: ${SRC_FBS_DIR}") + string(REGEX REPLACE "\\.fbs$" "_generated.h" GEN_HEADER ${SRC_FBS}) + add_custom_command( + OUTPUT ${GEN_HEADER} + COMMAND "${FLATBUFFERS_FLATC_EXECUTABLE}" + --cpp --gen-mutable --gen-object-api --reflect-names + --cpp-ptr-type flatbuffers::unique_ptr # Used to test with C++98 STLs + ${OPT} + -I "${CMAKE_CURRENT_SOURCE_DIR}/tests/include_test" + -o "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR}" + "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS}" + DEPENDS flatbuffers + COMMENT "Run generation: '${GEN_HEADER}'") + register_generated_output(${GEN_HEADER}) + add_custom_target(${TARGET} ALL DEPENDS ${GEN_HEADER}) +endfunction() + +set(FRAMEWORK_FBS_DIR "lite/model_parser/flatbuffers") +set(FRAMEWORK_SCHEMA_PATH "${FRAMEWORK_FBS_DIR}/framework.fbs") +compile_flatbuffers_schema_to_cpp_opt(framework_fbs_header ${FRAMEWORK_SCHEMA_PATH} "--no-includes;--gen-compare;--force-empty") +include_directories(${FLATBUFFERS_INCLUDE_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR}) + diff --git a/lite/demo/cxx/train_demo/README.md b/docs/demo_guides/cpp_train_demo.md similarity index 82% rename from lite/demo/cxx/train_demo/README.md rename to docs/demo_guides/cpp_train_demo.md index 56f4513d45676a1deb51bfb93096db156ddd0449..c10f2091f9c14f6fc81563248c75e72abd713666 100644 --- a/lite/demo/cxx/train_demo/README.md +++ b/docs/demo_guides/cpp_train_demo.md @@ -1,8 +1,10 @@ +# C++ Train Demo -# Introduction - 我们都知道,PaddleLite可以做移动端预测,事实上PaddleLite支持在移动端做模型训练。本文给出使用PaddleLite做训练的例子,这一例子对应的任务是“波士顿房价预测”,又称作“fit-a-line”。 +## Introduction + +我们都知道,PaddleLite可以做移动端预测,事实上PaddleLite支持在移动端做模型训练。本文给出使用PaddleLite做训练的例子,这一例子对应的任务是“波士顿房价预测”,又称作“fit-a-line”。 - 你可以通过book库中的 +你可以通过book库中的 [文档](https://paddlepaddle.org.cn/documentation/docs/zh/user_guides/simple_case/fit_a_line/README.cn.html) 和 [源码](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line) @@ -10,18 +12,16 @@ 其使用线性回归(Linear Regression) 模型做建模。本文主要介绍如何将其迁移至Paddle-Lite进行训练。 -注:这是一篇使用C++ API做模型训练的教程,其他API暂时不支持训练功能。 - -# Requirements +## Requirements - 一部安卓手机,用于运行训练程序 -- 装了Paddle (version: 1.7.0) 的python +- 装了Paddle (version >= 1.7.0) 的python -# Quick start +## Quick start -## Step1 build paddle-lite +### Step1 build paddle-lite -请按照[paddle-lite官方文档](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#paddlelite) 的教程编译full_publish的paddle-lite lib。以Linux上编译为例,其具体的命令为: +请按照paddle-lite官方文档的教程编译full_publish的paddle-lite lib。以Linux上编译为例,其具体的命令为: ```shell ## 配置环境 @@ -51,7 +51,7 @@ cd Paddle-Lite Paddle-Lite/build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so ``` -## Step2 编译lr_trainer +### Step2 编译lr_trainer ```shell cd Paddle-Lite/lite/demo/cxx/train_demo/cplus_train/ @@ -64,7 +64,7 @@ bin/ `-- demo_trainer ``` -## Step3 download model and run it! +### Step3 download model and run it! 
在你的笔记本电脑上,用usb连接到手机,开启开发者模式,在任意目录下执行: @@ -102,7 +102,7 @@ sample 8: Loss: 248.445 sample 9: Loss: 325.135 ``` -# 更多细节 +## 更多细节 上面提到的模型是直接下载得到的,如果你想自己生成,可以执行以下命令: ```shell @@ -125,9 +125,9 @@ md5sum fc_0.w_0: 2c7b3649b2a9cf7bcd19f8b256ce795d 如果你想生成自己的模型用于训练,可以参考`train.py`中保存模型的方式。 -# 与Paddle训练结果做校对 +## 与Paddle训练结果做校对 -## 前10个Loss值 +### 前10个Loss值 为了验证paddle与lite的一致性,我们控制模型参数一致、数据一致、batch size = 1的情况下,训练10个batch, 记录了二者的loss值。 @@ -171,11 +171,11 @@ sample 8: Loss: 248.445 sample 9: Loss: 325.135 ``` -## Loss 曲线 +### Loss 曲线 控制训练时的batch size为20,每个epoch对训练数据做全局shuffle,训练100个epoch后,paddle和lite的loss曲线对比如下。 -![lr_loss](image/lr_loss.png) +![lr_loss](../images/lr_loss.png) 如果想复现上述效果,paddle+python的运行命令为: diff --git a/docs/demo_guides/python_demo.md b/docs/demo_guides/python_demo.md index d6a7b15bd9be638ef586e6b589e35eecbf1613c2..59f81783c0b2e791f9623e84cf57c269cbb7d6f2 100644 --- a/docs/demo_guides/python_demo.md +++ b/docs/demo_guides/python_demo.md @@ -86,19 +86,28 @@ config.set_model_from_file(/YOU_MODEL_PATH/mobilenet_v1_opt.nb) predictor = create_paddle_predictor(config) ``` -(3) 设置输入数据 +(3) 从图片读入数据 + +```python +image = Image.open('./example.jpg') +resized_image = image.resize((224, 224), Image.BILINEAR) +image_data = np.array(resized_image).flatten().tolist() +``` + +(4) 设置输入数据 + ```python input_tensor = predictor.get_input(0) input_tensor.resize([1, 3, 224, 224]) -input_tensor.set_float_data([1.] * 3 * 224 * 224) +input_tensor.set_float_data(image_data) ``` -(4) 执行预测 +(5) 执行预测 ```python predictor.run() ``` -(5) 得到输出数据 +(6) 得到输出数据 ```python output_tensor = predictor.get_output(0) print(output_tensor.shape()) diff --git a/lite/demo/cxx/train_demo/image/lr_loss.png b/docs/images/lr_loss.png similarity index 100% rename from lite/demo/cxx/train_demo/image/lr_loss.png rename to docs/images/lr_loss.png diff --git a/docs/index.rst b/docs/index.rst index c241f091ed2cae906879f98b769bc6b7ce830fe1..b2fba7daba51c68207af27e249559c18ab10235f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -59,7 +59,14 @@ Welcome to Paddle-Lite's documentation! demo_guides/baidu_xpu demo_guides/rockchip_npu demo_guides/mediatek_apu - + +.. toctree:: + :maxdepth: 1 + :caption: 训练示例(预览) + :name: sec-train_demo_guides + + demo_guides/cpp_train_demo + .. 
toctree:: :maxdepth: 1 :caption: API文档 diff --git a/docs/user_guides/Compile/iOS.md b/docs/user_guides/Compile/iOS.md index 355cc11875ce8f8db891fb843d2f1624180b71ff..60375ad1085dfac090442f9c0dad86cf71b64c9e 100644 --- a/docs/user_guides/Compile/iOS.md +++ b/docs/user_guides/Compile/iOS.md @@ -61,7 +61,7 @@ inference_lite_lib.ios64.armv8 iOS预测库和头文件 - 裁剪预测库方法(只编译模型中的kernel&OP,降低预测库体积): ```shell -./lite/tools/build_android.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir +./lite/tools/build_ios.sh --with_strip=ON --opt_model_dir=YourOptimizedModelDir ``` ```shell --with_strip: (OFF|ON); 是否根据输入模型裁剪预测库,默认为OFF diff --git a/docs/user_guides/model_optimize_tool.md b/docs/user_guides/model_optimize_tool.md index fed728cb0e06c9758a0497a9cbb93d7edf39bda7..4c80d638d224d294e247ad3f5300498dd536be62 100644 --- a/docs/user_guides/model_optimize_tool.md +++ b/docs/user_guides/model_optimize_tool.md @@ -21,11 +21,11 @@ pip install paddlelite - 方法二: 下载opt可执行文件 从[release界面](https://github.com/PaddlePaddle/Paddle-Lite/releases),选择当前预测库对应版本的`opt`转化工具 -本文提供`release/v2.6`和`release/v2.2.0`版本的优化工具下载 +本文提供`release/v2.6.1`和`release/v2.2.0`版本的优化工具下载 |版本 | Linux | MacOS| |---|---|---| -| `release/v2.3`| [opt](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/opt_mac) | +| `release/v2.6.1` | [opt](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/opt/opt) | [opt_mac](https://paddlelite-data.bj.bcebos.com/Release/2.6.1/opt/opt_mac) | |`release/v2.2.0` | [model_optimize_tool](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool) | [model_optimize_tool_mac](https://paddlelite-data.bj.bcebos.com/model_optimize_tool/model_optimize_tool_mac) | - 方法三: 源码编译opt diff --git a/docs/user_guides/tutorial.md b/docs/user_guides/tutorial.md index 338449bfcb92e4029763c4357eb6d1fd5b820272..ee156038a6ea144921258734c92e9a2ea757d6ec 100644 --- a/docs/user_guides/tutorial.md +++ b/docs/user_guides/tutorial.md @@ -49,4 +49,4 @@ $ ./opt \ ## 五. 
测试工具 -为了使您更好的了解并使用Lite框架,我们向有进一步使用需求的用户开放了 [Debug工具](debug#debug) 和 [Profile工具](debug#profiler)。Lite Model Debug Tool可以用来查找Lite框架与PaddlePaddle框架在执行预测时模型中的对应变量值是否有差异,进一步快速定位问题Op,方便复现与排查问题。Profile Monitor Tool可以帮助您了解每个Op的执行时间消耗,其会自动统计Op执行的次数,最长、最短、平均执行时间等等信息,为性能调优做一个基础参考。您可以通过 [相关专题](debug) 了解更多内容。 +为了使您更好的了解并使用Lite框架,我们向有进一步使用需求的用户开放了 [Debug工具](debug) 和 [Profile工具](debug)。Lite Model Debug Tool可以用来查找Lite框架与PaddlePaddle框架在执行预测时模型中的对应变量值是否有差异,进一步快速定位问题Op,方便复现与排查问题。Profile Monitor Tool可以帮助您了解每个Op的执行时间消耗,其会自动统计Op执行的次数,最长、最短、平均执行时间等等信息,为性能调优做一个基础参考。您可以通过 [相关专题](debug) 了解更多内容。 diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index ff4d00dbb1051320f817c8220a11a77edde7fb05..eeea3b3adf4caf2e3ea57eb365c32f24626851e6 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -48,11 +48,13 @@ if (WITH_TESTING) endif() if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "GoogleNet_inference.tar.gz") - lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz") lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "step_rnn.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "bert.tar.gz") + lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "ernie.tar.gz") endif() endif() @@ -242,7 +244,6 @@ if (LITE_WITH_X86) add_dependencies(publish_inference_x86_cxx_lib test_model_bin) add_custom_target(publish_inference_x86_cxx_demos ${TARGET} - COMMAND rm -rf "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_light_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_light" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/x86_mobilenetv1_full_demo" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobilenetv1_full" diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 38a698d2ef608435b5aaa8274958ee6b8c7a8e03..0a8cf165996c6f1d3948cd29e3c0562b23570561 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -2,7 +2,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR (NOT LITE_WITH_LOG)) lite_cc_library(place SRCS paddle_place.cc DEPS logging) else() lite_cc_library(place SRCS paddle_place.cc DEPS glog) -endif(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) +endif() if (LITE_ON_TINY_PUBLISH) set(CMAKE_CXX_FLAGS_RELEASE "-Os -DNDEBUG") @@ -15,8 +15,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH #full api dynamic library lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc DEPS paddle_api paddle_api_light paddle_api_full) - add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto) - target_link_libraries(paddle_full_api_shared framework_proto) + target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files}) + add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto op_registry framework_fbs_header) + target_link_libraries(paddle_full_api_shared framework_proto op_registry) if(LITE_WITH_X86) add_dependencies(paddle_full_api_shared xxhash) target_link_libraries(paddle_full_api_shared xxhash) @@ -70,7 +71,7 @@ else() 
set(TARGET_COMIPILE_FLAGS "${TARGET_COMIPILE_FLAGS} -flto") endif() set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "${TARGET_COMIPILE_FLAGS}") - add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) + add_dependencies(paddle_light_api_shared op_list_h kernel_list_h framework_fbs_header) if (LITE_WITH_NPU) # Need to add HIAI runtime libs (libhiai.so) dependency target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs}) @@ -368,6 +369,9 @@ endif() if (LITE_WITH_PYTHON) add_subdirectory(python) + # add library for opt_base + lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils) + add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h) endif() if (LITE_ON_TINY_PUBLISH) @@ -375,9 +379,6 @@ if (LITE_ON_TINY_PUBLISH) endif() -# add library for opt_base -lite_cc_library(opt_base SRCS opt_base.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS kernel op optimizer mir_passes utils) -add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_faked_cc kernel_list_h) if (LITE_ON_MODEL_OPTIMIZE_TOOL) message(STATUS "Compiling opt") diff --git a/lite/api/android/jni/native/CMakeLists.txt b/lite/api/android/jni/native/CMakeLists.txt index d46e9f7cdec1cf422340ff11165ee166c7520bab..2929e24117c616a99ff4e078fd77fe8827186cb1 100644 --- a/lite/api/android/jni/native/CMakeLists.txt +++ b/lite/api/android/jni/native/CMakeLists.txt @@ -17,6 +17,7 @@ if (NOT LITE_ON_TINY_PUBLISH) # Unlike static library, module library has to link target to be able to work # as a single .so lib. target_link_libraries(paddle_lite_jni ${lib_DEPS} ${arm_kernels} ${npu_kernels}) + add_dependencies(paddle_lite_jni framework_fbs_header) if (LITE_WITH_NPU) # Strips the symbols of our protobuf functions to fix the conflicts during # loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so) @@ -31,7 +32,7 @@ else() endif() set_target_properties(paddle_lite_jni PROPERTIES COMPILE_FLAGS ${TARGET_COMIPILE_FLAGS}) target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc) - add_dependencies(paddle_lite_jni op_list_h kernel_list_h) + add_dependencies(paddle_lite_jni op_list_h kernel_list_h framework_fbs_header) if (LITE_WITH_NPU) # Need to add HIAI runtime libs (libhiai.so) dependency target_link_libraries(paddle_lite_jni ${npu_builder_libs} ${npu_runtime_libs}) diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index 1ed0f8a0e4c0d1bffbc8c8cd75261208a80ed546..505f42f98725a595e3a2e0c0b412d11ae7ad709e 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -13,18 +13,24 @@ // limitations under the License. 
#include "lite/api/cxx_api.h" + #include #include #include #include #include #include + #include "lite/api/paddle_use_passes.h" #include "lite/utils/io.h" namespace paddle { namespace lite { +std::vector GetAllOps() { + return OpLiteFactory::Global().GetAllOps(); +} + void Predictor::SaveModel(const std::string &dir, lite_api::LiteModelType model_type, bool record_info) { @@ -326,10 +332,8 @@ void Predictor::Build(const std::shared_ptr &desc, } } if (is_quantized_model) { -#ifdef LITE_WITH_ARM inner_places.insert(inner_places.begin(), Place{TARGET(kARM), PRECISION(kInt8)}); -#endif } Program program(*desc.get(), scope_, inner_places); diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h index 62e29dd71cca1692546517762e3dba72497acb6a..8206912bb6621764dc5d5d3b0fb5a0eae19d862c 100644 --- a/lite/api/cxx_api.h +++ b/lite/api/cxx_api.h @@ -41,6 +41,8 @@ static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] = ".tailored_kernels_source_list"; static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list"; +std::vector GetAllOps(); + /* * Predictor for inference, input a model, it will optimize and execute it. */ diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index 5fca2c9d70c18dfc731c720ba788f18e73c17742..db4d8a98ff86cd4a85dfbb2f9a8e25da0ea4390b 100644 --- a/lite/api/cxx_api_impl.cc +++ b/lite/api/cxx_api_impl.cc @@ -52,12 +52,10 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { if (!status_is_cloned_) { #ifdef LITE_WITH_MLU Env::Init(); - lite::DeviceInfo::Global().SetMLURunMode(config.mlu_core_version(), - config.mlu_core_number(), - config.mlu_use_first_conv(), - config.mlu_first_conv_mean(), - config.mlu_first_conv_std(), - config.mlu_input_layout()); + lite::TargetWrapperMlu::SetMLURunMode(config.mlu_core_version(), + config.mlu_core_number(), + config.mlu_input_layout(), + config.mlu_firstconv_param()); #endif // LITE_WITH_MLU auto use_layout_preprocess_pass = config.model_dir().find("OPENCL_PRE_PRECESS"); @@ -75,6 +73,10 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { mode_ = config.power_mode(); threads_ = config.threads(); +#ifdef LITE_WITH_NPU + Context::SetSubgraphModelCacheDir( + config.subgraph_model_cache_dir()); +#endif #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) int num_threads = config.x86_math_library_num_threads(); diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc index 5f57ed40ddb762f2d80fce2327a01100bae741d9..f0d1fb96fe4dfd5f8fa57808a2098cbc42db6a11 100644 --- a/lite/api/light_api.cc +++ b/lite/api/light_api.cc @@ -15,8 +15,6 @@ #include "lite/api/light_api.h" #include #include -#include "paddle_use_kernels.h" // NOLINT -#include "paddle_use_ops.h" // NOLINT namespace paddle { namespace lite { diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index 938079c51285bc8e8a7a25cd9e2d3682a739b567..cd640581a7a631aad733fd377bfc405869e90322 100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -13,6 +13,9 @@ // limitations under the License. 
#include "lite/api/paddle_api.h" + +#include + #include "lite/core/context.h" #include "lite/core/device_info.h" #include "lite/core/target_wrapper.h" @@ -21,6 +24,13 @@ #ifdef LITE_WITH_CUDA #include "lite/backends/cuda/target_wrapper.h" #endif +#ifdef LITE_WITH_XPU +#include "lite/backends/xpu/target_wrapper.h" +#endif + +#ifdef LITE_WITH_MLU +#include "lite/backends/mlu/target_wrapper.h" +#endif namespace paddle { namespace lite_api { @@ -106,6 +116,13 @@ void Tensor::CopyFromCpu(const T *src_data) { data, src_data, num * sizeof(T), lite::IoDirection::HtoD, *io_stream_); #else LOG(FATAL) << "Please compile the lib with CUDA."; +#endif + } else if (type == TargetType::kMLU) { +#ifdef LITE_WITH_MLU + lite::TargetWrapperMlu::MemcpySync( + data, src_data, num * sizeof(T), lite::IoDirection::HtoD); +#else + LOG(FATAL) << "Please compile the lib with MLU."; #endif } else { LOG(FATAL) << "The CopyFromCpu interface just support kHost, kARM, kCUDA"; @@ -127,6 +144,13 @@ void Tensor::CopyToCpu(T *data) const { lite::TargetWrapperCuda::StreamSync(*io_stream_); #else LOG(FATAL) << "Please compile the lib with CUDA."; +#endif + } else if (type == TargetType::kMLU) { +#ifdef LITE_WITH_MLU + lite::TargetWrapperMlu::MemcpySync( + data, src_data, num * sizeof(T), lite::IoDirection::DtoH); +#else + LOG(FATAL) << "Please compile the lib with MLU."; #endif } else { LOG(FATAL) << "The CopyToCpu interface just support kHost, kARM, kCUDA"; @@ -148,6 +172,11 @@ template void Tensor::CopyFromCpu(const int64_t *); template void Tensor::CopyFromCpu(const float *); template void Tensor::CopyFromCpu(const int8_t *); +template void Tensor::CopyFromCpu(const int *); +template void Tensor::CopyFromCpu(const int64_t *); +template void Tensor::CopyFromCpu(const float *); +template void Tensor::CopyFromCpu(const int8_t *); + template void Tensor::CopyToCpu(float *) const; template void Tensor::CopyToCpu(int *) const; template void Tensor::CopyToCpu(int8_t *) const; @@ -238,13 +267,9 @@ void CxxConfig::set_mlu_core_number(int core_number) { void CxxConfig::set_mlu_input_layout(DataLayoutType layout) { mlu_input_layout_ = layout; } -void CxxConfig::set_mlu_use_first_conv(bool use_first_conv) { - mlu_use_first_conv_ = use_first_conv; -} -void CxxConfig::set_mlu_first_conv_mean(const std::vector &mean) { +void CxxConfig::set_mlu_firstconv_param(const std::vector &mean, + const std::vector &std) { mlu_first_conv_mean_ = mean; -} -void CxxConfig::set_mlu_first_conv_std(const std::vector &std) { mlu_first_conv_std_ = std; } lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const { @@ -252,18 +277,15 @@ lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const { } int CxxConfig::mlu_core_number() const { return mlu_core_number_; } DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; } -bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; } -const std::vector &CxxConfig::mlu_first_conv_mean() const { - return mlu_first_conv_mean_; -} -const std::vector &CxxConfig::mlu_first_conv_std() const { - return mlu_first_conv_std_; +std::pair, std::vector> +CxxConfig::mlu_firstconv_param() const { + return std::make_pair(mlu_first_conv_mean_, mlu_first_conv_std_); } #endif void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) { #ifdef LITE_WITH_XPU - lite::Context::SetWorkspaceL3Size(l3_size); + lite::TargetWrapperXPU::workspace_l3_size_per_thread = l3_size; #else LOG(WARNING) << "The invoking of the function " "'set_xpu_workspace_l3_size_per_thread' is ignored, please " @@ 
-273,7 +295,7 @@ void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) { void CxxConfig::set_xpu_dev_per_thread(int dev_no) { #ifdef LITE_WITH_XPU - lite::Context::SetDev(dev_no); + lite::TargetWrapperXPU::SetDev(dev_no); #else LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is " "ignored, please rebuild it with LITE_WITH_XPU=ON."; @@ -282,7 +304,7 @@ void CxxConfig::set_xpu_dev_per_thread(int dev_no) { void CxxConfig::set_xpu_multi_encoder_precision(const std::string &precision) { #ifdef LITE_WITH_XPU - lite::Context::_multi_encoder_precision = precision; + lite::TargetWrapperXPU::multi_encoder_precision = precision; #else LOG(WARNING) << "The invoking of the function " "'set_xpu_multi_encoder_precision' is " diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index 9cf2e580bf7927b17bc62fb1c524a977ee806307..9c8e18f4c8f0505b9d909d4cf81b4dec6feece77 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -21,6 +21,7 @@ #define PADDLE_LITE_API_H_ #include #include +#include #include #include "paddle_place.h" // NOLINT @@ -174,9 +175,8 @@ class LITE_API CxxConfig : public ConfigBase { lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270}; int mlu_core_number_{1}; DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)}; - bool mlu_use_first_conv_{false}; - std::vector mlu_first_conv_mean_; - std::vector mlu_first_conv_std_; + std::vector mlu_first_conv_mean_{}; + std::vector mlu_first_conv_std_{}; #endif public: @@ -232,24 +232,22 @@ class LITE_API CxxConfig : public ConfigBase { void set_mlu_core_version(lite_api::MLUCoreVersion core_version); // set MLU core number, which is used when compiling MLU kernels void set_mlu_core_number(int core_number); - // set MLU input layout. User can specify layout of input data to be NHWC, - // default is NCHW - void set_mlu_input_layout(DataLayoutType layout); // whether use MLU's first conv kernel. First conv is a special kernel // provided by MLU, its input is uint8, and also needs two 3-dimentional // vectors which save all inputs' mean and std values - void set_mlu_use_first_conv(bool use_first_conv); - // set the 3-dimentional mean vector used by MLU's first conv - void set_mlu_first_conv_mean(const std::vector& mean); - // set the 3-dimentional std vector used by MLU's first conv - void set_mlu_first_conv_std(const std::vector& std); + // set the 3-dimentional mean vector and 3-dimentional std vector used by + // MLU's first conv + void set_mlu_firstconv_param(const std::vector& mean, + const std::vector& std); + // set MLU input layout. 
User can specify layout of input data to be NHWC, + // default is NCHW + void set_mlu_input_layout(DataLayoutType layout); lite_api::MLUCoreVersion mlu_core_version() const; int mlu_core_number() const; DataLayoutType mlu_input_layout() const; - bool mlu_use_first_conv() const; - const std::vector& mlu_first_conv_mean() const; - const std::vector& mlu_first_conv_std() const; + // std::pair + std::pair, std::vector> mlu_firstconv_param() const; #endif // XPU only, set the size of the workspace memory from L3 cache for the diff --git a/lite/api/paddle_api_test.cc b/lite/api/paddle_api_test.cc index 832867df079efa1baebf08da4c0d8e37958460f1..4edd61277059e20f7dfb1b8410a784fd04d85502 100644 --- a/lite/api/paddle_api_test.cc +++ b/lite/api/paddle_api_test.cc @@ -15,8 +15,11 @@ #include "lite/api/paddle_api.h" #include #include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" #include "lite/utils/cp_logging.h" #include "lite/utils/io.h" + DEFINE_string(model_dir, "", ""); namespace paddle { diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index 485bd10770d6e5a29963f336dfdf6d47302ccbc0..2ec4965d3d526c82c41b51954f9564488c5126e1 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -55,6 +55,8 @@ USE_MIR_PASS(apu_subgraph_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass); USE_MIR_PASS(lite_scale_activation_fuse_pass); USE_MIR_PASS(__xpu__resnet_fuse_pass); +USE_MIR_PASS(__xpu__resnet_cbam_fuse_pass); USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass); USE_MIR_PASS(__xpu__fc_fuse_pass); +USE_MIR_PASS(__xpu__mmdnn_fuse_pass); diff --git a/lite/api/test_yolov3_lite_bm.cc b/lite/api/test_yolov3_lite_bm.cc index d70ecf3c03955286244aa13cfe65f19569a55930..ded851d93313c3e155dd7f8860eee7446e56e715 100644 --- a/lite/api/test_yolov3_lite_bm.cc +++ b/lite/api/test_yolov3_lite_bm.cc @@ -59,9 +59,9 @@ void TestModel(const std::vector& valid_places) { } auto* image_tensor = predictor.GetInput(1); image_tensor->Resize(DDim(std::vector({1, 2}))); - data = image_tensor->mutable_data(); - data[0] = FLAGS_im_height; - data[1] = FLAGS_im_width; + auto* data_1 = image_tensor->mutable_data(); + data_1[0] = FLAGS_im_height; + data_1[1] = FLAGS_im_width; for (int i = 0; i < FLAGS_warmup; ++i) { predictor.Run(); diff --git a/lite/backends/arm/math/CMakeLists.txt b/lite/backends/arm/math/CMakeLists.txt index d50b46d5bd26e3186e5def2100042e5b22ce4977..9cf8f6a507401656bb0df214bd463a09fd82a61d 100644 --- a/lite/backends/arm/math/CMakeLists.txt +++ b/lite/backends/arm/math/CMakeLists.txt @@ -127,5 +127,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR) split_merge_lod_tenosr.cc reduce_prod.cc lstm.cc + clip.cc DEPS ${lite_kernel_deps} context tensor) endif() diff --git a/lite/backends/arm/math/activation.cc b/lite/backends/arm/math/activation.cc index 8e94e212fcb5ff83e8dbfa9d70652cbdaca50656..01f25cbd36d327f7a3c252fdc675262d39748318 100644 --- a/lite/backends/arm/math/activation.cc +++ b/lite/backends/arm/math/activation.cc @@ -763,24 +763,6 @@ void act_thresholded_relu( } } -#ifdef LITE_WITH_TRAIN -template <> -void act_square_grad(const float* din, - const float* dout_grad, - float* din_grad, - int size, - int threads) { - const float* ptr_out_grad = dout_grad; - float* ptr_in_grad = din_grad; - for (int i = 0; i < size; ++i) { - ptr_in_grad[0] = ptr_out_grad[0] * 2.0 * din[0]; - ptr_out_grad++; - ptr_in_grad++; - din++; - } -} -#endif - } // namespace math } // namespace arm } // namespace lite diff 
--git a/lite/backends/arm/math/activation.h b/lite/backends/arm/math/activation.h index 0a849e9ec711a8c554388d9b69a25b79a7b392ec..b0147040cd11a888ec045948f0914a13aa932a2f 100644 --- a/lite/backends/arm/math/activation.h +++ b/lite/backends/arm/math/activation.h @@ -90,12 +90,6 @@ template void act_thresholded_relu( const T* din, T* dout, int size, float threshold, int threads); -#ifdef LITE_WITH_TRAIN -template -void act_square_grad( - const T* din, const T* dout_grad, T* din_grad, int size, int threads); -#endif - } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/clip.cc b/lite/backends/arm/math/clip.cc new file mode 100644 index 0000000000000000000000000000000000000000..8f8b48db53b9fe1b50a0832a64b3849faa417fb8 --- /dev/null +++ b/lite/backends/arm/math/clip.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/arm/math/clip.h" +#include +#include +#include +#include "lite/backends/arm/math/funcs.h" +#include "lite/backends/arm/math/saturate.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void clip_kernel_fp32( + const float* input, int64_t num, float min, float max, float* output) { + float tmp; + for (int64_t i = 0; i < num; i++) { + tmp = *input; + tmp = tmp > min ? tmp : min; + *output = tmp < max ? tmp : max; + input++; + output++; + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/clip.h b/lite/backends/arm/math/clip.h new file mode 100644 index 0000000000000000000000000000000000000000..cd74a8880abfb660c13c630ca708fa9c8f849d12 --- /dev/null +++ b/lite/backends/arm/math/clip.h @@ -0,0 +1,33 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include "lite/operators/op_params.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void clip_kernel_fp32( + const float* input, int64_t num, float min, float max, float* output); +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/elementwise.cc b/lite/backends/arm/math/elementwise.cc index 4d08c1e957d43b5b748ffdb90fd14a07a61d0183..04373992e4802a0b0c2529daac851e00ebcb56cf 100644 --- a/lite/backends/arm/math/elementwise.cc +++ b/lite/backends/arm/math/elementwise.cc @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "lite/backends/arm/math/elementwise.h" +#include #include #include "lite/backends/arm/math/funcs.h" @@ -1254,6 +1254,19 @@ void elementwise_max_relu_broadcast(const float* dinx, } } +template <> +void elementwise_div(const int64_t* dinx, + const int64_t* diny, + int64_t* dout, + int num) { + for (int i = 0; i < num; i++) { + *dout = *dinx / *diny; + dout++; + dinx++; + diny++; + } +} + template <> void elementwise_div(const float* dinx, const float* diny, @@ -1306,6 +1319,28 @@ void elementwise_div(const float* dinx, } } +template <> +void elementwise_div_broadcast(const int64_t* dinx, + const int64_t* diny, + int64_t* dout, + int batch, + int channels, + int num) { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const int64_t* din_ptr = dinx + offset; + const int64_t diny_data = diny[j]; + int64_t* dout_ptr = dout + offset; + for (int p = 0; p < num; p++) { + *dout_ptr = *din_ptr / diny_data; + dout_ptr++; + din_ptr++; + } + } + } +} + template <> void elementwise_div_broadcast(const float* dinx, const float* diny, @@ -1541,6 +1576,87 @@ void elementwise_div_relu_broadcast(const float* dinx, } } +template +void elementwise_mod_broadcast( + const T* dinx, const T* diny, T* dout, int batch, int channels, int num) { +#pragma omp parallel for collapse(2) + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const T* din_ptr = dinx + offset; + const T diny_data = diny[j]; + T* dout_ptr = dout + offset; + + int cnt = num >> 2; + int remain = num % 4; + for (int k = 0; k < cnt; ++k) { + register T dinx0 = din_ptr[0]; + register T dinx1 = din_ptr[1]; + register T dinx2 = din_ptr[2]; + register T dinx3 = din_ptr[3]; + dout_ptr[0] = dinx0 % diny_data; + dout_ptr[1] = dinx1 % diny_data; + dout_ptr[2] = dinx2 % diny_data; + dout_ptr[3] = dinx3 % diny_data; + din_ptr += 4; + dout_ptr += 4; + } + if (remain > 0) { + for (int p = 0; p < remain; p++) { + *dout_ptr++ = *din_ptr++ % diny_data; + } + } + } + } +} + +template +void elementwise_mod(const T* dinx, const T* diny, T* dout, int num) { + int cnt = num >> 2; + int remain = num % 4; +#pragma omp parallel for + for (int i = 0; i < cnt; i++) { + const T* dinx_ptr = dinx + (i << 2); + const T* diny_ptr = diny + (i << 2); + T* dout_ptr = dout + (i << 2); + + register T dinx0 = dinx_ptr[0]; + register T dinx1 = dinx_ptr[1]; + register T dinx2 = dinx_ptr[2]; + register T dinx3 = dinx_ptr[3]; + + register T diny0 = diny_ptr[0]; + register T diny1 = diny_ptr[1]; + register T diny2 = diny_ptr[2]; + register T diny3 = diny_ptr[3]; + + dout_ptr[0] = dinx0 % diny0; + dout_ptr[1] 
= dinx1 % diny1; + dout_ptr[2] = dinx2 % diny2; + dout_ptr[3] = dinx3 % diny3; + } + if (remain > 0) { + const T* dinx_ptr = dinx + (cnt << 2); + const T* diny_ptr = diny + (cnt << 2); + T* dout_ptr = dout + (cnt << 2); + for (int i = 0; i < remain; i++) { + *dout_ptr++ = *dinx_ptr++ % *diny_ptr++; + } + } +} + +template void elementwise_mod(const int64_t* dinx, + const int64_t* diny, + int64_t* dout, + int num); + +template void elementwise_mod_broadcast(const int64_t* dinx, + const int64_t* diny, + int64_t* dout, + int batch, + int channels, + int num); + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/elementwise.h b/lite/backends/arm/math/elementwise.h index 06ecab08edcaf06614de94b99084be2ee80647aa..0b400fcce26c7d307777cc6e25d8d25e0d6234bc 100644 --- a/lite/backends/arm/math/elementwise.h +++ b/lite/backends/arm/math/elementwise.h @@ -253,6 +253,13 @@ template void elementwise_div_relu_broadcast( const T* dinx, const T* diny, T* dout, int batch, int channels, int num); +template +void elementwise_mod(const T* dinx, const T* diny, T* dout, int num); + +template +void elementwise_mod_broadcast( + const T* dinx, const T* diny, T* dout, int batch, int channels, int num); + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/funcs.h b/lite/backends/arm/math/funcs.h index e975160c97b6e7396ab208805a4d685586ac00c8..75dcc971b80e53c3874ffcbb108afdc0e0faa705 100644 --- a/lite/backends/arm/math/funcs.h +++ b/lite/backends/arm/math/funcs.h @@ -25,6 +25,7 @@ #include "lite/backends/arm/math/axpy.h" #include "lite/backends/arm/math/beam_search.h" #include "lite/backends/arm/math/box_coder.h" +#include "lite/backends/arm/math/clip.h" #include "lite/backends/arm/math/col_im_transform.h" #include "lite/backends/arm/math/concat.h" #include "lite/backends/arm/math/conv_block_utils.h" diff --git a/lite/backends/arm/math/softmax.cc b/lite/backends/arm/math/softmax.cc index 65d41b049123680f26674cc05d3c02172a260b31..b7f82e9f376e8b62195d884e8de19a142d76b316 100644 --- a/lite/backends/arm/math/softmax.cc +++ b/lite/backends/arm/math/softmax.cc @@ -531,7 +531,7 @@ void softmax_inner1_large_axis(const float* din, } float32x2_t vhmax = vmax_f32(vget_high_f32(vmax), vget_low_f32(vmax)); float max_data = std::max(vget_lane_f32(vhmax, 0), vget_lane_f32(vhmax, 1)); - for (j = 4 * j; j < axis_size; ++j) { + for (j = 4 * nn; j < axis_size; ++j) { max_data = std::max(max_data, din_max_ptr[0]); din_max_ptr++; } @@ -557,7 +557,7 @@ void softmax_inner1_large_axis(const float* din, float32x2_t vhsum = vadd_f32(vget_high_f32(vsum), vget_low_f32(vsum)); float sum_data = vget_lane_f32(vhsum, 0) + vget_lane_f32(vhsum, 1); - for (j = 4 * j; j < axis_size; ++j) { + for (j = 4 * nn; j < axis_size; ++j) { dout_sum_ptr[0] = expf(din_sum_ptr[0] - max_data); sum_data += dout_sum_ptr[0]; din_sum_ptr++; diff --git a/lite/backends/cuda/cuda_utils.h b/lite/backends/cuda/cuda_utils.h index 4c7cedaa97e22f74caebc5288fad8543f61bc88d..012004a65fa7d531ed85837e27b880c8c493ffca 100644 --- a/lite/backends/cuda/cuda_utils.h +++ b/lite/backends/cuda/cuda_utils.h @@ -41,6 +41,8 @@ << "CUDA: " << cudaGetErrorString(e); \ } +#define CUDA_POST_KERNEL_CHECK CUDA_CALL(cudaPeekAtLastError()) + #define CUBLAS_CALL(func) \ { \ auto e = (func); \ @@ -127,6 +129,10 @@ static const char* CudnnGetErrorInfo(cudnnStatus_t status) { return "CUDNN_STATUS_RUNTIME_IN_PROGRESS"; case CUDNN_STATUS_RUNTIME_FP_OVERFLOW: return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW"; +#endif +#if 
CUDNN_VERSION_MIN(8, 0, 0) + case CUDNN_STATUS_VERSION_MISMATCH: + return "CUDNN_STATUS_VERSION_MISMATCH"; #endif } return "Unknown cudnn status"; diff --git a/lite/backends/cuda/math/CMakeLists.txt b/lite/backends/cuda/math/CMakeLists.txt index 9e33d38feedbe682f3c4d962b4ccb85b74af3a7b..7f96308a5dcaf5742bd5dcef7c2e5f146cdb7c59 100644 --- a/lite/backends/cuda/math/CMakeLists.txt +++ b/lite/backends/cuda/math/CMakeLists.txt @@ -13,6 +13,8 @@ nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps}) nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps}) nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps}) nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps}) +nv_library(cuda_strided_gemm SRCS strided_gemm.cc DEPS ${cuda_static_deps}) +nv_library(cuda_sequence_padding SRCS sequence_padding.cu DEPS ${cuda_static_deps}) set ( math_cuda @@ -25,6 +27,8 @@ set ( cudnn_pool cuda_gemm cuda_batched_gemm + cuda_strided_gemm + cuda_sequence_padding ) set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda") diff --git a/lite/backends/cuda/math/cudnn_conv.cc b/lite/backends/cuda/math/cudnn_conv.cc index 19ace2762af7d2088d5235e20387d8a4d941be30..5db41302c0cb0133e3badad0b5fa167d2c88f9df 100644 --- a/lite/backends/cuda/math/cudnn_conv.cc +++ b/lite/backends/cuda/math/cudnn_conv.cc @@ -161,15 +161,17 @@ bool CudnnConv2D::create(const operators::ConvParam& param, search_func); } else { - CUDNN_CHECK( - cudnnGetConvolutionForwardAlgorithm(this->handle_, - this->input_desc_, - this->filter_desc_, - this->conv_desc_, - this->output_desc_, - this->preference_, - this->workspace_limit_bytes_, - &this->fwd_algo_)); + int requestedAlgoCount = 1; + int returnedAlgoCount; + CUDNN_CHECK(cudnnGetConvolutionForwardAlgorithm_v7(this->handle_, + this->input_desc_, + this->filter_desc_, + this->conv_desc_, + this->output_desc_, + requestedAlgoCount, + &returnedAlgoCount, + &this->algo_perf_)); + this->fwd_algo_ = this->algo_perf_.algo; } CUDNN_CHECK( cudnnGetConvolutionForwardWorkspaceSize(this->handle_, diff --git a/lite/backends/cuda/math/cudnn_conv.h b/lite/backends/cuda/math/cudnn_conv.h index f73f1db7b1785814b6e97f28c8624b76fa75f89c..a084edefa17a5882f7e6d67407e1f48a818e3407 100644 --- a/lite/backends/cuda/math/cudnn_conv.h +++ b/lite/backends/cuda/math/cudnn_conv.h @@ -81,6 +81,7 @@ class CudnnConv2DBase { cudaStream_t stream_; cudnnHandle_t handle_; cudnnConvolutionFwdAlgo_t fwd_algo_; + cudnnConvolutionFwdAlgoPerf_t algo_perf_; cudnnTensorDescriptor_t input_desc_; cudnnTensorDescriptor_t output_desc_; cudnnTensorDescriptor_t bias_desc_; @@ -98,8 +99,6 @@ class CudnnConv2DBase { const bool use_tensor_core_ = true; const size_t workspace_limit_bytes_ = 4 * 1024 * 1024; - const cudnnConvolutionFwdPreference_t preference_ = - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST; // For int8 Tensor temp_tensor_; diff --git a/lite/backends/cuda/math/sequence_padding.cu b/lite/backends/cuda/math/sequence_padding.cu new file mode 100644 index 0000000000000000000000000000000000000000..3a32be2a3446e420cac53a33506f141a001d61f0 --- /dev/null +++ b/lite/backends/cuda/math/sequence_padding.cu @@ -0,0 +1,166 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/math/sequence_padding.h" +#include "lite/backends/cuda/math/utils.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +enum CopyType { kSeqToPad, kPadToSeq }; + +template +__global__ void SequencePadKernel(T* dst, + const T* src, + const T* pad_value, + bool is_constant_pad, + const size_t* seq_offsets, + const int seq_num, + const int pad_seq_len, + const int step_width) { + size_t seq_idx = blockIdx.y; + size_t seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx]; + + size_t step_idx = blockIdx.x * blockDim.y + threadIdx.y; + size_t seq_data_offset = (seq_offsets[seq_idx] + step_idx) * step_width; + size_t pad_data_offset = (seq_idx * pad_seq_len + step_idx) * step_width; + T* dst_data = dst + (Type == kSeqToPad ? pad_data_offset : seq_data_offset); + const T* src_data = + src + (Type == kSeqToPad ? seq_data_offset : pad_data_offset); + + if (step_idx < seq_len) { + for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) { + dst_data[i] = src_data[i]; + } + } else if (step_idx < pad_seq_len && Type == kSeqToPad) { + for (size_t i = threadIdx.x; i < step_width; i += blockDim.x) { + dst_data[i] = is_constant_pad ? pad_value[0] : pad_value[i]; + } + } +} + +template +void SequencePadding(T* pad_data, + const T* seq_data, + const T* pad_value_data, + bool is_constant_pad, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream) { + const int kBlockSize = 512; + /* At least use 32 threads to copy sequence_width elements, + * and at least 8 elements for each thread. + */ + size_t block_dim_x = + std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + size_t block_dim_y = kBlockSize / block_dim_x; + dim3 threads(block_dim_x, block_dim_y); + + size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = seq_num; + dim3 grid(grid_dim_x, grid_dim_y); + + SequencePadKernel<<>>( + pad_data, + seq_data, + pad_value_data, + is_constant_pad, + seq_offsets_data, + seq_num, + pad_seq_len, + step_width); + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); +} + +template +void SequenceUnpadding(T* seq_data, + const T* pad_data, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream) { + const int kBlockSize = 512; + /* At least use 32 threads to copy sequence_width elements, + * and at least 8 elements for each thread. 
+ */ + size_t block_dim_x = + std::min(((((step_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize); + size_t block_dim_y = kBlockSize / block_dim_x; + dim3 threads(block_dim_x, block_dim_y); + + size_t grid_dim_x = (pad_seq_len + block_dim_y - 1) / block_dim_y; + size_t grid_dim_y = seq_num; + dim3 grid(grid_dim_x, grid_dim_y); + + SequencePadKernel<<>>( + seq_data, + pad_data, + nullptr, + false, + seq_offsets_data, + seq_num, + pad_seq_len, + step_width); + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); +} + +template void SequencePadding(float* pad_data, + const float* seq_data, + const float* pad_value_data, + bool is_constant_pad, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +template void SequencePadding(half* pad_data, + const half* seq_data, + const half* pad_value_data, + bool is_constant_pad, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +template void SequenceUnpadding(float* seq_data, + const float* pad_data, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +template void SequenceUnpadding(half* seq_data, + const half* pad_data, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/sequence_padding.h b/lite/backends/cuda/math/sequence_padding.h new file mode 100644 index 0000000000000000000000000000000000000000..cfbac9b5bce2cad75174695ee85c28720a3eaf11 --- /dev/null +++ b/lite/backends/cuda/math/sequence_padding.h @@ -0,0 +1,51 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +void SequenceUnpadding(T* seq_data, + const T* pad_data, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +template +void SequencePadding(T* pad_data, + const T* seq_data, + const T* pad_value_data, + bool is_constant_pad, + const size_t* seq_offsets_data, + int seq_num, + int pad_seq_len, + int step_width, + cudaStream_t* stream); + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/strided_gemm.cc b/lite/backends/cuda/math/strided_gemm.cc new file mode 100644 index 0000000000000000000000000000000000000000..91013d977702682a42050407f49356bf7445bcbd --- /dev/null +++ b/lite/backends/cuda/math/strided_gemm.cc @@ -0,0 +1,136 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/math/strided_gemm.h" + +#include + +#include "lite/core/device_info.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +bool StridedGemm::init(const bool trans_a, + const bool trans_b, + Context* ctx) { + if (cu_handle_ == nullptr) { + this->exe_stream_ = ctx->exec_stream(); + CUBLAS_CALL(cublasCreate(&cu_handle_)); + CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_)); + } + cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N; + return true; +} + +template <> +bool StridedGemm::run(const float alpha, + const float beta, + const int m, + const int n, + const int k, + const float* a_data, + const float* b_data, + float* c_data, + const int batch_size, + const int64_t stride_a, + const int64_t stride_b) { + lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m; + ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? n : k; + ldc_ = n; + m_ = m; + n_ = n; + k_ = k; + const int64_t stride_c = m_ * n_; + CUBLAS_CALL(cublasGemmStridedBatchedEx(cu_handle_, + cu_trans_b_, + cu_trans_a_, + n_, + m_, + k_, + &alpha, + b_data, + CUDA_R_32F, + ldb_, + stride_b, + a_data, + CUDA_R_32F, + lda_, + stride_a, + &beta, + c_data, + CUDA_R_32F, + ldc_, + stride_c, + batch_size, + CUDA_R_32F, + algo_)); + return true; +} + +template <> +bool StridedGemm::run(const half alpha, + const half beta, + const int m, + const int n, + const int k, + const half* a_data, + const half* b_data, + half* c_data, + const int batch_size, + const int64_t stride_a, + const int64_t stride_b) { + lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m; + ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? n : k; + ldc_ = n; + m_ = m; + n_ = n; + k_ = k; + const int64_t stride_c = m_ * n_; + CUBLAS_CALL(cublasGemmStridedBatchedEx(cu_handle_, + cu_trans_b_, + cu_trans_a_, + n_, + m_, + k_, + &alpha, + b_data, + CUDA_R_16F, + ldb_, + stride_b, + a_data, + CUDA_R_16F, + lda_, + stride_a, + &beta, + c_data, + CUDA_R_16F, + ldc_, + stride_c, + batch_size, + CUDA_R_16F, + algo_)); + return true; +} + +template class StridedGemm; +template class StridedGemm; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/strided_gemm.h b/lite/backends/cuda/math/strided_gemm.h new file mode 100644 index 0000000000000000000000000000000000000000..4a0fe7143a2569eda36d203d9c905f2a4a0c772c --- /dev/null +++ b/lite/backends/cuda/math/strided_gemm.h @@ -0,0 +1,72 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include + +#include "lite/api/paddle_place.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/context.h" +#include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +class StridedGemm { + public: + StridedGemm() : cu_handle_(nullptr) {} + ~StridedGemm() {} + + bool init(const bool trans_a, + const bool trans_b, + Context* ctx); + + bool run(const PtypeIn alpha, + const PtypeIn beta, + const int m, + const int n, + const int k, + const PtypeIn* a_data, + const PtypeIn* b_data, + PtypeOut* c_data, + const int batch_size, + const int64_t stride_a, + const int64_t stride_b); + + private: + cudaStream_t exe_stream_; + cublasHandle_t cu_handle_; + cublasOperation_t cu_trans_a_; + cublasOperation_t cu_trans_b_; + int m_{-1}; + int n_{-1}; + int k_{-1}; + int lda_{-1}; + int ldb_{-1}; + int ldc_{-1}; + cublasGemmAlgo_t algo_{CUBLAS_GEMM_DEFAULT_TENSOR_OP}; +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/transpose.cu b/lite/backends/cuda/math/transpose.cu index c50840fe269657965db8c58b171fce6819009775..d919bd757fbbcfcc5e5f8a3a4c18fbd1ed9ac53f 100644 --- a/lite/backends/cuda/math/transpose.cu +++ b/lite/backends/cuda/math/transpose.cu @@ -174,24 +174,9 @@ void Transpose::transpose(T* dst, TransposeCUDAImpl(src_dims, axes, src, dst, &Y_dims_, &strides_, stream); } -// template -// void Transpose::transpose(T* dst, -// const T* src, -// const std::vector& src_dims, -// const std::vector& axes, -// cudaStream_t* stream) { -// std::vector _src_dims(src_dims.size(), 0); -// std::transform( -// src_dims.begin(), -// src_dims.end(), -// _src_dims.begin(), -// [](int data) -> int64_t { return static_cast(data); }); -// TransposeCUDAImpl(_src_dims, axes, src, dst, &Y_dims_, &strides_, -// stream); -//} - template class Transpose; template class Transpose; +template class Transpose; } // namespace math } // namespace cuda diff --git a/lite/backends/mlu/target_wrapper.cc b/lite/backends/mlu/target_wrapper.cc index 2385f69246a163830e0df855082d728da2743e02..b98854946db7eda4f133d773ae0f5ba9e45a77cc 100644 --- a/lite/backends/mlu/target_wrapper.cc +++ b/lite/backends/mlu/target_wrapper.cc @@ -15,6 +15,7 @@ #include "lite/backends/mlu/target_wrapper.h" #include +#include #include "lite/backends/mlu/mlu_utils.h" @@ -36,6 +37,13 @@ void cnrtMemcpyDtoH(void* dst, const void* src, size_t size) { } // namespace mlu +thread_local cnmlCoreVersion_t TargetWrapperMlu::mlu_core_version_{CNML_MLU270}; +thread_local int TargetWrapperMlu::mlu_core_number_{1}; +thread_local bool TargetWrapperMlu::use_first_conv_{false}; +thread_local std::vector TargetWrapperMlu::mean_vec_; +thread_local std::vector TargetWrapperMlu::std_vec_; +thread_local DataLayoutType TargetWrapperMlu::input_layout_{DATALAYOUT(kNCHW)}; + size_t TargetWrapperMlu::num_devices() { uint32_t dev_count = 0; CNRT_CALL(cnrtGetDeviceCount(&dev_count)) << " cnrt get device count failed"; @@ -77,15 +85,42 @@ 
void TargetWrapperMlu::MemcpySync(void* dst, LOG(FATAL) << "Unsupported IoDirection" << static_cast(dir); } } +void TargetWrapperMlu::SetMLURunMode( + lite_api::MLUCoreVersion core_version, + int core_number, + DataLayoutType input_layout, + std::pair, std::vector> firstconv_param) { + switch (core_version) { + case (lite_api::MLUCoreVersion::MLU_220): + mlu_core_version_ = CNML_MLU220; + break; + case (lite_api::MLUCoreVersion::MLU_270): + mlu_core_version_ = CNML_MLU270; + break; + default: + mlu_core_version_ = CNML_MLU270; + break; + } + mlu_core_number_ = core_number; + mean_vec_ = firstconv_param.first; + std_vec_ = firstconv_param.second; + use_first_conv_ = !(mean_vec_.empty() || std_vec_.empty()); + input_layout_ = input_layout; +} + +cnmlCoreVersion_t TargetWrapperMlu::MLUCoreVersion() { + return mlu_core_version_; +} + +int TargetWrapperMlu::MLUCoreNumber() { return mlu_core_number_; } + +bool TargetWrapperMlu::UseFirstConv() { return use_first_conv_; } + +const std::vector& TargetWrapperMlu::MeanVec() { return mean_vec_; } + +const std::vector& TargetWrapperMlu::StdVec() { return std_vec_; } -// void TargetWrapperMlu::MemcpyAsync(void* dst, -// const void* src, -// size_t size, -// IoDirection dir, -// const stream_t& stream) { -// LOG(WARNING) << "Mlu unsupported MemcpyAsync now, use MemcpySync."; -// MemcpySync(dst, src, size, dir); -// } +DataLayoutType TargetWrapperMlu::InputLayout() { return input_layout_; } } // namespace lite } // namespace paddle diff --git a/lite/backends/mlu/target_wrapper.h b/lite/backends/mlu/target_wrapper.h index 2d9e10806f78e56f50b04d408dab219c923456fc..2566ae153e2f9539d1ad5739f208bc5f946a7542 100644 --- a/lite/backends/mlu/target_wrapper.h +++ b/lite/backends/mlu/target_wrapper.h @@ -13,6 +13,8 @@ // limitations under the License. 
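The run-mode state set by `SetMLURunMode` lives in `thread_local` statics (declared in the header below), so each predictor thread carries its own core version, core count, first-conv parameters, and input layout without locking; a configuration made on one thread is invisible to the others. A small standalone illustration of that pattern (names are illustrative, not the Lite API):

```cpp
#include <iostream>
#include <thread>

struct RunMode {
  static thread_local int core_number;  // one instance per thread
};
thread_local int RunMode::core_number = 1;

int main() {
  std::thread worker([] {
    RunMode::core_number = 16;  // visible only to this thread
    std::cout << "worker sees " << RunMode::core_number << "\n";  // prints 16
  });
  worker.join();
  std::cout << "main sees " << RunMode::core_number << "\n";  // still prints 1
  return 0;
}
```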
#pragma once +#include +#include #include "lite/backends/mlu/mlu_utils.h" #include "lite/core/target_wrapper.h" @@ -43,11 +45,25 @@ class TargetWrapper { const void* src, size_t size, IoDirection dir); - // static void MemcpyAsync(void* dst, - // const void* src, - // size_t size, - // IoDirection dir, - // const queue_t& queue); + static void SetMLURunMode( + lite_api::MLUCoreVersion core_version, + int core_number, + DataLayoutType input_layout, + std::pair, std::vector> firstconv_param); + static cnmlCoreVersion_t MLUCoreVersion(); + static int MLUCoreNumber(); + static bool UseFirstConv(); + static const std::vector& MeanVec(); + static const std::vector& StdVec(); + static DataLayoutType InputLayout(); + + private: + static thread_local cnmlCoreVersion_t mlu_core_version_; + static thread_local int mlu_core_number_; + static thread_local bool use_first_conv_; + static thread_local std::vector mean_vec_; + static thread_local std::vector std_vec_; + static thread_local DataLayoutType input_layout_; }; } // namespace lite diff --git a/lite/backends/npu/device.cc b/lite/backends/npu/device.cc index f9803aa8810ada33b9eecafe1502515501514e41..22f760e39f86b29ccf025a83b2a43c87882f9e02 100644 --- a/lite/backends/npu/device.cc +++ b/lite/backends/npu/device.cc @@ -20,96 +20,122 @@ namespace paddle { namespace lite { namespace npu { -bool WriteToOMFile(const domi::ModelBufferData& om_model_buff, - std::string om_file_path) { - FILE* fp; - fp = fopen(om_file_path.c_str(), "wb"); - CHECK(fp != nullptr) << om_file_path << " open failed!"; - - uint32_t write_size = - (uint32_t)fwrite(om_model_buff.data, 1, om_model_buff.length, fp); - CHECK_EQ(write_size, om_model_buff.length) << "write om file failed !"; - - fclose(fp); - return true; -} - -bool ReadFromOMFile(domi::ModelBufferData* om_model_buff, - std::string om_file_path) { - FILE* fp; - fp = fopen(om_file_path.c_str(), "rb"); - CHECK(fp != nullptr) << om_file_path << " open failed!"; - - fseek(fp, 0, SEEK_END); - uint32_t model_length = (uint32_t)ftell(fp); - fseek(fp, 0, SEEK_SET); - om_model_buff->data = malloc(model_length); - om_model_buff->length = model_length; - uint32_t read_size = - (uint32_t)fread(om_model_buff->data, 1, model_length, fp); - CHECK_EQ(read_size, model_length) << "read om file failed !"; - - fclose(fp); - return true; -} - -std::shared_ptr Device::Build( - const std::string model_name, // NOLINT - std::vector& input_nodes, // NOLINT - std::vector& output_nodes, // NOLINT - const std::string model_cache_full_dir = "" // NOLINT - ) { - VLOG(3) << "[NPU] Build model"; - // Build the HiAI IR graph to the HiAI om model - ge::Graph ir_graph("graph"); - ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes); - ge::Model om_model("model", "model"); - om_model.SetGraph(ir_graph); - domi::HiaiIrBuild ir_build; - domi::ModelBufferData om_model_buf; - - if (!model_cache_full_dir.empty() && IsFileExists(model_cache_full_dir)) { - VLOG(3) << "Will read om model from " << model_cache_full_dir; - ReadFromOMFile(&om_model_buf, model_cache_full_dir); - } else { - if (!ir_build.CreateModelBuff(om_model, om_model_buf)) { - LOG(WARNING) << "[NPU] CreateModelBuff failed!"; - return nullptr; - } - if (!ir_build.BuildIRModel(om_model, om_model_buf)) { - LOG(WARNING) << "[NPU] BuildIRModel failed!"; - ir_build.ReleaseModelBuff(om_model_buf); - return nullptr; - } - if (!model_cache_full_dir.empty()) { - VLOG(3) << "Will write om model to " << model_cache_full_dir; - WriteToOMFile(om_model_buf, model_cache_full_dir); - } - } - +std::shared_ptr 
Device::Load( + const std::string& model_name, + std::vector* model_buffer, + bool* model_comp) { // Create a HiAI model manager client to load the HiAI om model - std::shared_ptr model_client( - new hiai::AiModelMngerClient()); + auto model_client = std::make_shared(); if (model_client->Init(nullptr) != hiai::AI_SUCCESS) { - LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!"; - ir_build.ReleaseModelBuff(om_model_buf); + LOG(WARNING) << "[NPU] Init hiai model client failed!"; return nullptr; } + // Check HiAI DDK version + const char* ddk_version = model_client->GetVersion(); + if (ddk_version) { + LOG(INFO) << "[NPU] HiAI DDK version: " << ddk_version; + } else { + LOG(WARNING) << "[NPU] Unable to get HiAI DDK version!"; + } + // Check model compatibility auto model_desc = std::make_shared( model_name, freq_level(), framework_type(), model_type(), device_type()); - model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length); - std::vector> model_descs; - model_descs.push_back(model_desc); + model_desc->SetModelBuffer( + reinterpret_cast(model_buffer->data()), + model_buffer->size()); + if (!*model_comp && + model_client->CheckModelCompatibility(*model_desc, *model_comp) != + hiai::AI_SUCCESS) { + *model_comp = false; + VLOG(3) << "[NPU] model is NOT compatiblitiable, setting model_comp to " + << *model_comp; + } else { + *model_comp = true; + VLOG(3) << "[NPU] model is compatiblitiable, setting model_comp to " + << *model_comp; + } + // Rebuild and write the data of the compatible model to the model buffer + if (!*model_comp) { + std::shared_ptr model_builder = + std::make_shared(model_client); + hiai::MemBuffer* org_model_buffer = model_builder->InputMemBufferCreate( + reinterpret_cast(model_buffer->data()), model_buffer->size()); + if (org_model_buffer) { + std::vector org_model_buffers; + org_model_buffers.push_back(org_model_buffer); + hiai::MemBuffer* new_model_buffer = model_builder->OutputMemBufferCreate( + framework_type(), org_model_buffers); + // VLOG(3) << "[NPU] new model buffer memeory size is " << + // new_model_buffer->GetMemBufferSize(); + if (new_model_buffer) { + uint32_t new_model_size = 0; + if (model_builder->BuildModel(org_model_buffers, + new_model_buffer, + new_model_size) == hiai::AI_SUCCESS) { + // need to change to new_model_size as GetMemBufferSize is not + // correct. 
+ model_buffer->resize(new_model_size); + memcpy(reinterpret_cast(model_buffer->data()), + new_model_buffer->GetMemBufferData(), + new_model_size); + // Reset the model buffer + model_desc->SetModelBuffer( + reinterpret_cast(model_buffer->data()), + model_buffer->size()); + VLOG(3) << "[NPU] Rebuild the compatible model done."; + } else { + LOG(WARNING) << "[NPU] Rebuild the compatible model failed!"; + } + model_builder->MemBufferDestroy(new_model_buffer); + } else { + LOG(WARNING) << "[NPU] OutputMemBufferCreate failed!"; + } + model_builder->MemBufferDestroy(org_model_buffer); + } else { + LOG(WARNING) << "[NPU] InputMemBufferCreate failed!"; + } + } + // Load the compatible model + std::vector> model_descs{ + model_desc}; if (model_client->Load(model_descs) != hiai::AI_SUCCESS) { LOG(WARNING) << "[NPU] AiModelMngerClient load model failed!"; - ir_build.ReleaseModelBuff(om_model_buf); return nullptr; } - ir_build.ReleaseModelBuff(om_model_buf); - VLOG(3) << "[NPU] Build done"; + VLOG(3) << "[NPU] Load model done."; return model_client; } +bool Device::Build(std::vector& input_nodes, // NOLINT + std::vector& output_nodes, // NOLINT + std::vector* model_buffer) { + // Convert the HiAI IR graph to the HiAI om model + ge::Graph ir_graph("graph"); + ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes); + ge::Model om_model("model", "model"); + om_model.SetGraph(ir_graph); + + // Build the HiAI om model, serialize and output it to the om buffer + domi::HiaiIrBuild ir_build; + domi::ModelBufferData om_buffer; + if (!ir_build.CreateModelBuff(om_model, om_buffer)) { + LOG(WARNING) << "[NPU] CreateModelBuff failed!"; + return false; + } + if (!ir_build.BuildIRModel(om_model, om_buffer)) { + LOG(WARNING) << "[NPU] BuildIRModel failed!"; + ir_build.ReleaseModelBuff(om_buffer); + return false; + } + model_buffer->resize(om_buffer.length); + memcpy(reinterpret_cast(model_buffer->data()), + reinterpret_cast(om_buffer.data), + om_buffer.length); + ir_build.ReleaseModelBuff(om_buffer); + VLOG(3) << "[NPU] Build model done."; + return true; +} + } // namespace npu } // namespace lite } // namespace paddle diff --git a/lite/backends/npu/device.h b/lite/backends/npu/device.h index cf03e097194bf20ab428677b09b840991e8a902c..5862f0b393292d95b6500ae75171fab07a5279a6 100644 --- a/lite/backends/npu/device.h +++ b/lite/backends/npu/device.h @@ -38,14 +38,18 @@ class Device { int model_type() { return model_type_; } int device_type() { return device_type_; } + // Load the HiAI om model from buffer, rebuild the model if it's incompatible + // with the current device, then create a HiAI model manager client(from HiAI + // Server) to run inference + std::shared_ptr Load( + const std::string& model_name, + std::vector* model_buffer, + bool* model_comp); // Build the HiAI IR graph to om model, return HiAI model manager client to // load om model and run inference. 
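With `Build` and `Load` split this way, om-model caching moves out of the device layer: the caller serializes the IR graph into a raw buffer once, persists it however it likes, and hands the buffer to `Load`, which rewrites it in place when the device reports it as incompatible. A hypothetical caller-side sketch of that flow (`ReadCache`/`WriteCache` are illustrative helpers, not part of this API, and the node type is assumed to be `ge::Operator`):

```cpp
// Illustrative flow only; ReadCache/WriteCache are hypothetical helpers.
std::shared_ptr<hiai::AiModelMngerClient> BuildOrLoad(
    paddle::lite::npu::Device* device,
    const std::string& model_name,
    const std::string& cache_path,
    std::vector<ge::Operator>& input_nodes,     // NOLINT
    std::vector<ge::Operator>& output_nodes) {  // NOLINT
  std::vector<char> model_buffer;
  bool from_cache = ReadCache(cache_path, &model_buffer);
  if (!from_cache && !device->Build(input_nodes, output_nodes, &model_buffer)) {
    return nullptr;
  }
  // A cached buffer was already rebuilt for this device, so Load can skip the
  // compatibility check; a freshly built one starts with model_comp = false.
  bool model_comp = from_cache;
  auto client = device->Load(model_name, &model_buffer, &model_comp);
  if (client != nullptr && (!from_cache || !model_comp)) {
    WriteCache(cache_path, model_buffer);  // persist the freshly built or rebuilt buffer
  }
  return client;
}
```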
- std::shared_ptr Build( - const std::string model_name, // NOLINT - std::vector& input_nodes, // NOLINT - std::vector& output_nodes, // NOLINT - const std::string model_cache_name // NOLINT - ); // NOLINT + bool Build(std::vector& input_nodes, // NOLINT + std::vector& output_nodes, // NOLINT + std::vector* model_buffer); private: int freq_level_{3}; diff --git a/lite/backends/opencl/cl_context.cc b/lite/backends/opencl/cl_context.cc index 67d679fdd596b109b714bf7ba3cd45b2632b9420..002073517bc61af60da213db9af6e56da5f5b501 100644 --- a/lite/backends/opencl/cl_context.cc +++ b/lite/backends/opencl/cl_context.cc @@ -119,7 +119,7 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) { } } -cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size, +cl::NDRange CLContext::LocalWorkSizeTune(cl::NDRange global_work_size, size_t max_work_size, int divisor) { int preferred_lws = 0; @@ -157,7 +157,7 @@ cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size, static_cast(gws0)}; #endif } -cl::NDRange CLContext::LocalWorkSizeTurnReverse(cl::NDRange global_work_size, +cl::NDRange CLContext::LocalWorkSizeTuneReverse(cl::NDRange global_work_size, size_t max_work_size, int divisor) { int preferred_lws = 0; diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h index 82d15bee5ec460a1fb06430571f007fcef23f66f..c204a8510402b8741c761938c3b2c37ac07fe961 100644 --- a/lite/backends/opencl/cl_context.h +++ b/lite/backends/opencl/cl_context.h @@ -62,10 +62,10 @@ class CLContext { cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size); - cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size, + cl::NDRange LocalWorkSizeTune(cl::NDRange global_work_size, size_t max_work_size, int divitor = 2); - cl::NDRange LocalWorkSizeTurnReverse(cl::NDRange global_work_size, + cl::NDRange LocalWorkSizeTuneReverse(cl::NDRange global_work_size, size_t max_work_size, int divitor = 2); bool IsArmMali(); diff --git a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl index a14748c69f3eafce515c90f2b8a226703fe5883d..080ce2b457421970409431dee6841ac4f7d57bb5 100644 --- a/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/fc_kernel.cl @@ -405,7 +405,9 @@ void fc_gemm_4x4(__global const CL_DTYPE* a, } else { for (int cidx = col; cidx < N; ++cidx) { for (int ridx = row; ridx < M; ++ridx) { - CL_COMPUTE_DTYPE a0, b0, c0 = bias ? bias[cidx] : 0; + CL_COMPUTE_DTYPE a0 = 0; + CL_COMPUTE_DTYPE b0 = 0; + CL_COMPUTE_DTYPE c0 = bias ? 
bias[cidx] : 0; for (int p = 0; p < K; ++p) { a0 = *(a + ridx * K + p); b0 = *(b + p * N + cidx), diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl index 1c808da68ddc923e12234bc4b6ac99b35bfffb0b..9209f0e0f8d04fad5e788f3742c7922af8e13f49 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_opt_kernel.cl @@ -6,9 +6,7 @@ __kernel void conv2d_1x1_opt( __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, @@ -284,9 +282,7 @@ __kernel void conv2d_1x1_simple( __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl index 771765ea6063a08784ae824a757b28450d808f6d..6a3aa6455daf8d20430a434ff6f47dac382f1f74 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_kernel.cl @@ -19,9 +19,7 @@ __kernel void conv2d_3x3(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int offset, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl index 79f3922e89549fc15b7a849efb0e2b6595357102..739f852a7c6b60e4c38cb2523dfb745af65bc8df 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_3x3_opt_kernel.cl @@ -19,9 +19,7 @@ __kernel void conv2d_3x3_opt(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -264,9 +262,7 @@ __kernel void conv2d_3x3_multi_batch(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl index d856af6a1d4026b1595bc287901e53f64267dc81..f08d53fa4968d041337adfe3252529bca3b5c55e 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_kernel.cl @@ -5,9 +5,7 @@ __kernel void conv2d_5x5(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t 
new_scale, __read_only image2d_t new_biase, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl index 4ed2e072022dc4b457a86d634bf4bc21ab62bc45..4cce039f27b750950a1475ac266e0f5117c6d259 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_5x5_opt_kernel.cl @@ -20,9 +20,7 @@ __kernel void conv2d_5x5_opt(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -268,9 +266,7 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -513,4 +509,4 @@ __kernel void conv2d_5x5_multi_batch(__private const int item_ch, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]); } -} \ No newline at end of file +} diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl index 4998dc99279fffad8750ef3b6495597e9fc4ad65..2a2f210601e760651ee850686391af3c040fbe7f 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_kernel.cl @@ -5,9 +5,7 @@ __kernel void conv2d_7x7(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl index d82f4b4c96b586b6ecf948827402afd0766dcea4..4eadcd9f8032996abae04660b6878ab5beaff9a7 100644 --- a/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/conv2d_7x7_opt_kernel.cl @@ -20,9 +20,7 @@ __kernel void conv2d_7x7_opt(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -268,9 +266,7 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch, __private const int item_h, __read_only image2d_t input_image, __read_only image2d_t filter_image, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, @@ -513,4 +509,4 @@ __kernel void conv2d_7x7_multi_batch(__private const int item_ch, (int2)(out_w_base_id + out_w_id4, item_h_id), output[4]); } -} \ No newline at end of file +} diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl index 27313aea23ed16ecc7a6763dfbbbe63bca18941a..465b9f8f925a130b4d1b059ab15e93bc29128ec7 100755 --- 
a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_basic_kernel.cl @@ -19,9 +19,7 @@ __kernel void depth_conv2d(__private const int global_size_dim0, __private const int global_size_dim2, __read_only image2d_t input, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif #ifdef BATCH_NORM __read_only image2d_t new_scale, __read_only image2d_t new_biase, diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl index 5626fe6be7d451d4ffe22a2008affa7d82298bc3..6fbdc21f934f21dd26c3eb66885f7087e3d340c0 100755 --- a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl @@ -20,9 +20,7 @@ __kernel void depth_conv2d_3x3( __private const int global_size_dim2, __read_only image2d_t input, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int offset, @@ -249,9 +247,7 @@ __kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk, __private const int ou_nh, __read_only image2d_t input, __read_only image2d_t filter, -#if defined(BIASE_CH) || defined(BIASE_ELE) __read_only image2d_t bias, -#endif __write_only image2d_t output_image, __private const int stride, __private const int pad, diff --git a/lite/backends/opencl/cl_kernel/image/transpose_kernel.cl b/lite/backends/opencl/cl_kernel/image/transpose_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..b8533076b79aa2e94e30e38dd34d3f2292fdf88a --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/transpose_kernel.cl @@ -0,0 +1,160 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
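Dropping the `#if defined(BIASE_CH) || defined(BIASE_ELE)` guards in these conv and depthwise kernels keeps the kernel signature identical across all macro variants, which lets the host bind arguments at fixed indices instead of conditionally skipping the bias slot; when no bias is fused, a dummy image can be bound. A rough host-side sketch of the resulting binding (the helper name and the starting index are illustrative only):

```cpp
#include <CL/cl2.hpp>

// Illustrative only: with the bias image always in the signature, the argument
// indices no longer depend on which BIASE_* macro the kernel was compiled with.
void BindConvImages(cl::Kernel* kernel,
                    const cl::Image2D& input,
                    const cl::Image2D& filter,
                    const cl::Image2D& bias,  // bound unconditionally now
                    const cl::Image2D& output) {
  cl_uint idx = 3;  // assuming the leading __private int arguments occupy 0..2
  kernel->setArg(idx++, input);
  kernel->setArg(idx++, filter);
  kernel->setArg(idx++, bias);
  kernel->setArg(idx++, output);
}
```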
*/ + +#include + +__kernel void transpose_4d(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = 1; + const int out_h = out_nh % out_H; + const int out_c0 = out_c * 4; + const int out_c1 = out_c * 4 + 1; + const int out_c2 = out_c * 4 + 2; + const int out_c3 = out_c * 4 + 3; + + const int in_n = out_n; + const int in_c = out_w * 0.25; + const int in_h0 = out_c0; + const int in_h1 = out_c1; + const int in_h2 = out_c2; + const int in_h3 = out_c3; + const int in_w = out_h; + + int2 output_pos; + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_nh; + + int2 input_pos0; + int2 input_pos1; + int2 input_pos2; + int2 input_pos3; + + input_pos0.x = in_W * in_c + in_w; + input_pos0.y = in_n * in_h0; + + input_pos1.x = in_W * in_c + in_w; + input_pos1.y = in_n * in_h1; + + input_pos2.x = in_W * in_c + in_w; + input_pos2.y = in_n * in_h2; + + input_pos3.x = in_W * in_c + in_w; + input_pos3.y = in_n * in_h3; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 input0; + CL_DTYPE4 input1; + CL_DTYPE4 input2; + CL_DTYPE4 input3; + CL_DTYPE4 output; + input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos0); + + if (out_w % 4 == 0) { + output.x = input0.x; + } else if (out_w % 4 == 1) { + output.x = input0.y; + } else if (out_w % 4 == 2) { + output.x = input0.z; + } else { + output.x = input0.w; + } + if (out_C - out_c * 4 >= 2) { + input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos1); + if(out_w % 4 == 0) { + output.y = input1.x; + } else if(out_w % 4 == 1) { + output.y = input1.y; + } else if(out_w % 4 == 2) { + output.y = input1.z; + } else { + output.y = input1.w; + } + } else { + output.y = 0.0f; + } + + if (out_C - out_c * 4 >= 3) { + input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos2); + if (out_w % 4 == 0){ + output.z = input2.x; + } else if (out_w % 4 == 1) { + output.z = input2.y; + } else if (out_w % 4 == 2) { + output.z = input2.z; + } else { + output.z = input2.w; + } + } else { + output.z = 0.0f; + } + + if (out_C - out_c * 4 >= 4) { + input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos3); + if (out_w % 4 == 0) { + output.w = input3.x; + } else if (out_w % 4 == 1) { + output.w = input3.y; + } else if (out_w % 4 == 2) { + output.w = input3.z; + } else { + output.w = input3.w; + } + } else { + output.w = 0.0f; + } + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output); +} + +__kernel void transpose(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H, + __private const int out_W, + __private const int in_W) { + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + const int out_n = 1; + const int out_h = out_nh % out_H; + + const int in_n = 1; + const int in_c = out_c; + const int in_w = out_h; + const int in_h = out_w; + + int2 input_pos; + int2 output_pos; + input_pos.x = in_c * in_W + in_w; + input_pos.y = in_n * in_h; + + output_pos.x = out_c * out_W + out_w; + output_pos.y = out_n * out_h; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + CL_DTYPE4 input; + CL_DTYPE4 output; + 
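The coordinate arithmetic in these transpose kernels follows the image2d layout this backend appears to use for 4-D tensors: every RGBA texel packs four consecutive channels, the x-axis spans channel-blocks times width, and the y-axis spans batch times height (with the batch fixed to 1 here, y reduces to the row index). A small helper expressing that mapping, stated as an assumption for reference:

```cpp
struct ImageCoord {
  int x;
  int y;
};

// Assumed NCHW -> image2d mapping behind the input_pos/output_pos arithmetic above:
// each texel holds channels [4*(c/4), 4*(c/4)+3] of one (n, h, w) position.
ImageCoord NCHWToImageCoord(int n, int c, int h, int w, int H, int W) {
  return {(c / 4) * W + w, n * H + h};
}
```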
input = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos); + + output = input; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, input); +} \ No newline at end of file diff --git a/lite/backends/x86/dynamic_loader.cc b/lite/backends/x86/dynamic_loader.cc index 4978dfb84a4ee5770df011c54dccde59a62135b7..0d4301c5b6a56e50eba2d9a6ae13ce353a9b1e2e 100644 --- a/lite/backends/x86/dynamic_loader.cc +++ b/lite/backends/x86/dynamic_loader.cc @@ -20,8 +20,8 @@ limitations under the License. */ #include "lite/backends/x86/cupti_lib_path.h" #include "lite/backends/x86/port.h" #include "lite/backends/x86/warpctc_lib_path.h" +#include "lite/utils/cp_logging.h" #include "lite/utils/env.h" -#include "lite/utils/paddle_enforce.h" // DEFINE_string(cudnn_dir, // "", @@ -178,7 +178,7 @@ auto error_msg = #endif // !_WIN32 if (throw_on_error) { CHECK(dso_handle != nullptr); - // PADDLE_ENFORCE(nullptr != dso_handle, error_msg, dlPath, errorno); + // CHECK(nullptr != dso_handle, error_msg, dlPath, errorno); } else if (nullptr == dso_handle) { // LOG(WARNING) << string::Sprintf(error_msg, dlPath, errorno); } diff --git a/lite/backends/x86/jit/benchmark.cc b/lite/backends/x86/jit/benchmark.cc index c49984691e5beca5a42defd68243e1352372cf11..6318916dfa53d5cce0c33d0149a520ccb9288c28 100644 --- a/lite/backends/x86/jit/benchmark.cc +++ b/lite/backends/x86/jit/benchmark.cc @@ -319,8 +319,8 @@ void BenchKernelSgd() { const T lr = 0.1; auto UnDuplicatedRandomVec = []( int n, const int64_t lower, const int64_t upper) -> std::vector { - PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1); - PADDLE_ENFORCE_GT(n, 0); + CHECK_LE(static_cast(upper - lower), n - 1); + CHECK_GT(n, 0); std::vector all, out; for (int i = 0; i < n; ++i) { all.push_back(i); diff --git a/lite/backends/x86/jit/gen/embseqpool.cc b/lite/backends/x86/jit/gen/embseqpool.cc index 7e697014ed241a75693b783127633b255964f80b..e6628058d03959a2a58b403a6ad61af6c50b431c 100644 --- a/lite/backends/x86/jit/gen/embseqpool.cc +++ b/lite/backends/x86/jit/gen/embseqpool.cc @@ -129,11 +129,11 @@ class EmbSeqPoolCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const emb_seq_pool_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.table_height, 0); - PADDLE_ENFORCE_GT(attr.table_width, 0); - PADDLE_ENFORCE_GT(attr.index_height, 0); - PADDLE_ENFORCE_GT(attr.index_width, 0); - PADDLE_ENFORCE_GT(attr.out_width, 0); + CHECK_GT(attr.table_height, 0); + CHECK_GT(attr.table_width, 0); + CHECK_GT(attr.index_height, 0); + CHECK_GT(attr.index_width, 0); + CHECK_GT(attr.out_width, 0); return make_unique(attr, CodeSize(attr)); } }; diff --git a/lite/backends/x86/jit/gen/embseqpool.h b/lite/backends/x86/jit/gen/embseqpool.h index 7bb248dd1d384af949fd3cd190df3d90d21921ef..d013887be5ecec1f67fa022b49b889f9cee9ade4 100644 --- a/lite/backends/x86/jit/gen/embseqpool.h +++ b/lite/backends/x86/jit/gen/embseqpool.h @@ -17,7 +17,7 @@ #include #include "lite/backends/x86/jit/gen/jitcode.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/jit/gen/matmul.cc b/lite/backends/x86/jit/gen/matmul.cc index f78df73f66532f891721c74cff9c78cc3bb61922..87fe758809e3e7e18d2f939a26f3729b937bf6f6 100644 --- a/lite/backends/x86/jit/gen/matmul.cc +++ b/lite/backends/x86/jit/gen/matmul.cc @@ -27,7 +27,7 @@ void MatMulJitCode::genCode() { preCode(); int block, rest; const auto groups = packed_groups(n_, k_, &block, &rest); - 
PADDLE_ENFORCE_GT(groups.front(), 0); + CHECK_GT(groups.front(), 0); const int block_len = sizeof(float) * block; const int x_reg_idx = (block == ZMM_FLOAT_BLOCK ? 32 : 16) - 1; @@ -116,9 +116,9 @@ class MatMulCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const matmul_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.m, 0); - PADDLE_ENFORCE_GT(attr.n, 0); - PADDLE_ENFORCE_GT(attr.k, 0); + CHECK_GT(attr.m, 0); + CHECK_GT(attr.n, 0); + CHECK_GT(attr.k, 0); return make_unique(attr, CodeSize(attr)); } }; diff --git a/lite/backends/x86/jit/gen/matmul.h b/lite/backends/x86/jit/gen/matmul.h index 95edc14201ac94d302ff806d0a4b8f5f50b2835c..8bc1e41d0a17d548c47819b5e11daf7ed5065e86 100644 --- a/lite/backends/x86/jit/gen/matmul.h +++ b/lite/backends/x86/jit/gen/matmul.h @@ -19,7 +19,7 @@ #include #include "lite/backends/x86/jit/gen/jitcode.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { @@ -32,7 +32,7 @@ class MatMulJitCode : public JitCode { size_t code_size = 256 * 1024, void* code_ptr = nullptr) : JitCode(code_size, code_ptr), m_(attr.m), n_(attr.n), k_(attr.k) { - PADDLE_ENFORCE_EQ(m_, 1, "Only support m==1 yet"); + CHECK_EQ(m_, 1) << "Only support m==1 yet"; this->genCode(); } diff --git a/lite/backends/x86/jit/gen/seqpool.cc b/lite/backends/x86/jit/gen/seqpool.cc index 4c80737aac4bc9cd09f4ff222c8fad8c441887ec..c54093e4dfa00f89f51c70840c45518f3eddfd3d 100644 --- a/lite/backends/x86/jit/gen/seqpool.cc +++ b/lite/backends/x86/jit/gen/seqpool.cc @@ -69,8 +69,8 @@ class SeqPoolCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const seq_pool_attr_t& attr) const override { - PADDLE_ENFORCE_GT(attr.w, 0); - PADDLE_ENFORCE_GT(attr.h, 0); + CHECK_GT(attr.w, 0); + CHECK_GT(attr.h, 0); return make_unique(attr, CodeSize(attr)); } }; diff --git a/lite/backends/x86/jit/gen/seqpool.h b/lite/backends/x86/jit/gen/seqpool.h index a00428f3e0982889665cd23b21a5978c7c239399..a1bde4a9b66f22ef8815bdc61fe866065e7f4203 100644 --- a/lite/backends/x86/jit/gen/seqpool.h +++ b/lite/backends/x86/jit/gen/seqpool.h @@ -17,7 +17,7 @@ #include #include "lite/backends/x86/jit/gen/jitcode.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { @@ -125,8 +125,8 @@ class SeqPoolJitCode : public JitCode { vmovss(xmm_t(reg_idx + max_num_regs), ptr[reg_ptr_src_i]); reg_idx++; } - PADDLE_ENFORCE_EQ( - reg_idx, rest_used_num_regs, "All heights should use same regs"); + CHECK_EQ(reg_idx, rest_used_num_regs) + << "All heights should use same regs"; for (int i = 0; i < reg_idx; ++i) { vaddps(xmm_t(i), xmm_t(i), xmm_t(i + max_num_regs)); } diff --git a/lite/backends/x86/jit/gen/sgd.cc b/lite/backends/x86/jit/gen/sgd.cc index 44e083366132c675b339b2da4bbb3b7c1c6b7569..f91f1305ee30af708443e6a9a8bbb3fae2cc0b80 100644 --- a/lite/backends/x86/jit/gen/sgd.cc +++ b/lite/backends/x86/jit/gen/sgd.cc @@ -17,7 +17,7 @@ #include #include #include "lite/backends/x86/jit/registry.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -113,9 +113,9 @@ class SgdCreator : public JitCodeCreator { } std::unique_ptr CreateJitCode( const sgd_attr_t& attr) const override { - PADDLE_ENFORCE_EQ(attr.param_width, attr.grad_width); - PADDLE_ENFORCE_LE(attr.selected_rows_size, attr.grad_height); - PADDLE_ENFORCE_GE(attr.selected_rows_size, 0); + CHECK_EQ(attr.param_width, 
attr.grad_width); + CHECK_LE(attr.selected_rows_size, attr.grad_height); + CHECK_GE(attr.selected_rows_size, 0); return make_unique(attr, CodeSize(attr)); } }; diff --git a/lite/backends/x86/jit/gen/vbroadcast.cc b/lite/backends/x86/jit/gen/vbroadcast.cc index fb1e71f7b0b1e6f68a331d264682e80fbab7c219..7c4860ba5084860b67b6ecb7e3eed8aafb16cb2c 100644 --- a/lite/backends/x86/jit/gen/vbroadcast.cc +++ b/lite/backends/x86/jit/gen/vbroadcast.cc @@ -16,7 +16,7 @@ #include #include #include "lite/backends/x86/jit/registry.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -76,7 +76,7 @@ class VBroadcastCreator : public JitCodeCreator { return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8; } std::unique_ptr CreateJitCode(const int64_t& w) const override { - PADDLE_ENFORCE_GT(w, 0); + CHECK_GT(w, 0); return make_unique(w, CodeSize(w)); } }; diff --git a/lite/backends/x86/jit/gen_base.cc b/lite/backends/x86/jit/gen_base.cc index a3376be423828b25c6eda6fff30a56578c7bbbe5..a9a89fdb205ad54268986eeee628aec75ac01b74 100644 --- a/lite/backends/x86/jit/gen_base.cc +++ b/lite/backends/x86/jit/gen_base.cc @@ -21,8 +21,8 @@ // posix_memalign #include "lite/backends/x86/cpu_info.h" #include "lite/backends/x86/jit/macro.h" +#include "lite/utils/cp_logging.h" #include "lite/utils/env.h" -#include "lite/utils/paddle_enforce.h" #ifndef _WIN32 #define posix_memalign_free free @@ -62,12 +62,10 @@ void* GenBase::operator new(size_t size) { #ifdef _WIN32 ptr = _aligned_malloc(size, alignment); #else - PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), - 0, - "GenBase Alloc %ld error!", - size); + CHECK_EQ(posix_memalign(&ptr, alignment, size), 0) << "GenBase Alloc " << size + << " error!"; #endif - PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size); + CHECK(ptr) << "Fail to allocate GenBase CPU memory: size = " << size; return ptr; } diff --git a/lite/backends/x86/jit/helper.cc b/lite/backends/x86/jit/helper.cc index 8322f7ebd2ce78f99979574983d81cebe5139606..f80a24d15c4666eacd31770c46f8a7ad4e7cfb37 100644 --- a/lite/backends/x86/jit/helper.cc +++ b/lite/backends/x86/jit/helper.cc @@ -14,9 +14,10 @@ #include "lite/backends/x86/jit/helper.h" #include // tolower +#include #include #include -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -104,12 +105,12 @@ void pack_weights(const float* src, float* dst, int n, int k) { int block, rest; const auto groups = packed_groups(n, k, &block, &rest); std::for_each(groups.begin(), groups.end(), [&](int i) { - PADDLE_ENFORCE_GT(i, 0, "each element of groups should be larger than 0."); + CHECK_GT(i, 0) << "each element of groups should be larger than 0."; }); int sum = std::accumulate(groups.begin(), groups.end(), 0); std::memset(dst, 0, k * sum * block * sizeof(float)); - PADDLE_ENFORCE_GE( - sum * block, n, "The packed n should be equal to or larger than n"); + CHECK_GE(sum * block, n) + << "The packed n should be equal to or larger than n"; const int block_len = sizeof(float) * block; int n_offset = 0; diff --git a/lite/backends/x86/jit/helper.h b/lite/backends/x86/jit/helper.h index f741edbbed5b721fb9104a9c9a171a12532e4705..57a3611bb671c6d83ec3212702a57e3fc7d7f35f 100644 --- a/lite/backends/x86/jit/helper.h +++ b/lite/backends/x86/jit/helper.h @@ -23,7 +23,7 @@ #include "lite/backends/x86/jit/kernel_base.h" #include "lite/backends/x86/jit/kernel_key.h" #include "lite/backends/x86/jit/kernel_pool.h" -#include 
"lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -78,8 +78,8 @@ inline const Kernel* GetReferKernel() { auto& ref_pool = ReferKernelPool::Instance().AllKernels(); KernelKey kkey(KernelTuple::kernel_type, lite::fluid::CPUPlace()); auto ref_iter = ref_pool.find(kkey); - PADDLE_ENFORCE(ref_iter != ref_pool.end(), - "Every Kernel should have reference function."); + CHECK(ref_iter != ref_pool.end()) + << "Every Kernel should have reference function."; auto& ref_impls = ref_iter->second; for (auto& impl : ref_impls) { auto i = dynamic_cast*>(impl.get()); @@ -94,7 +94,7 @@ template inline typename KernelTuple::func_type GetReferFunc() { auto ker = GetReferKernel(); auto p = dynamic_cast*>(ker); - PADDLE_ENFORCE(p, "The Refer kernel should exsit"); + CHECK(p) << "The Refer kernel should exsit"; return p->GetFunc(); } @@ -125,7 +125,7 @@ std::vector GetAllCandidateKernels( // The last implementation should be reference function on CPUPlace. auto ref = GetReferKernel(); - PADDLE_ENFORCE(ref != nullptr, "Refer Kernel can not be empty."); + CHECK(ref != nullptr) << "Refer Kernel can not be empty."; res.emplace_back(ref); return res; } @@ -140,11 +140,11 @@ GetAllCandidateFuncsWithTypes(const typename KernelTuple::attr_type& attr) { std::string name = k->ImplType(); if (name == "JitCode") { auto i = dynamic_cast(k); - PADDLE_ENFORCE(i, "jitcode kernel cast can not fail."); + CHECK(i) << "jitcode kernel cast can not fail."; res.emplace_back(std::make_pair(name, i->template getCode())); } else { auto i = dynamic_cast*>(k); - PADDLE_ENFORCE(i, "kernel cast can not fail."); + CHECK(i) << "kernel cast can not fail."; res.emplace_back(std::make_pair(name, i->GetFunc())); } } @@ -166,7 +166,7 @@ template typename KernelTuple::func_type GetDefaultBestFunc( const typename KernelTuple::attr_type& attr) { auto funcs = GetAllCandidateFuncs(attr); - PADDLE_ENFORCE_GE(funcs.size(), 1UL); + CHECK_GE(funcs.size(), 1UL); // Here could do some runtime benchmark of this attr and return the best one. // But yet just get the first one as the default best one, // which is searched in order and tuned by offline. 
diff --git a/lite/backends/x86/jit/kernel_key.cc b/lite/backends/x86/jit/kernel_key.cc index a6288fcf19d6867e1e1eb0bce32e559a4f303929..30397ffe1c4980e4af19a7a0eb44b47585b44f2c 100644 --- a/lite/backends/x86/jit/kernel_key.cc +++ b/lite/backends/x86/jit/kernel_key.cc @@ -14,7 +14,7 @@ #include "lite/backends/x86/jit/kernel_key.h" #include // XXH64: 13.8 GB/s -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/jit/more/mkl/mkl.h b/lite/backends/x86/jit/more/mkl/mkl.h index 6bc791e64575b8f481f91ea3c28ea4896fe1860d..473e1253194513c16d6d8c3b52eac110512e806e 100644 --- a/lite/backends/x86/jit/more/mkl/mkl.h +++ b/lite/backends/x86/jit/more/mkl/mkl.h @@ -18,7 +18,7 @@ #include #include #include "lite/backends/x86/jit/kernel_base.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -104,11 +104,11 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out, const emb_seq_pool_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); + CHECK_EQ(attr->table_width * attr->index_width, attr->out_width); auto check_idx_value_valid = [&](int64_t i) { - PADDLE_ENFORCE_LT( - idx[i], attr->table_height, "idx value: %d, i: %d", idx[i], i); - PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: %d", idx[i], i); + CHECK_LT(idx[i], attr->table_height) << "idx value: " << idx[i] + << " i: " << i; + CHECK_GE(idx[i], 0) << "idx value: " << idx[i] << " i: " << i; }; for (int64_t w = 0; w != attr->index_width; ++w) { @@ -175,22 +175,22 @@ void Sgd(const T* lr, const int64_t* rows, T* out, const sgd_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); - PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); + CHECK_EQ(attr->param_width, attr->grad_width); + CHECK_LE(attr->selected_rows_size, attr->grad_height); T scalar = -lr[0]; int width = attr->grad_width; if (out == param) { for (int64_t i = 0; i < attr->selected_rows_size; ++i) { auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); + CHECK_LT(h_idx, attr->param_height); + CHECK_GE(h_idx, 0); VAXPY(scalar, grad + i * width, out + h_idx * width, width); } } else { for (int64_t i = 0; i < attr->selected_rows_size; ++i) { auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); + CHECK_LT(h_idx, attr->param_height); + CHECK_GE(h_idx, 0); VScal(&scalar, grad + i * width, out + h_idx * width, width); VAdd(param + h_idx * width, out + h_idx * width, diff --git a/lite/backends/x86/jit/refer/refer.h b/lite/backends/x86/jit/refer/refer.h index d8c8d86911ab9a7794192aa68fb0c0571b1e4d26..b7243dfda350e8d0ea5909cf84ae3aa76d845055 100644 --- a/lite/backends/x86/jit/refer/refer.h +++ b/lite/backends/x86/jit/refer/refer.h @@ -22,7 +22,6 @@ #include "lite/backends/x86/jit/kernel_base.h" #include "lite/backends/x86/jit/macro.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/paddle_enforce.h" namespace paddle { namespace lite { @@ -480,12 +479,12 @@ void EmbSeqPool(const T* table, const int64_t* idx, T* out, const emb_seq_pool_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->table_width * attr->index_width, attr->out_width); + CHECK_EQ(attr->table_width * attr->index_width, attr->out_width); auto check_idx_value_valid = [&](int64_t i) { - PADDLE_ENFORCE_LT( - idx[i], attr->table_height, "idx value: %d, i: %d", idx[i], i); - PADDLE_ENFORCE_GE(idx[i], 0, "idx value: %d, i: 
%d", idx[i], i); + CHECK_LT(idx[i], attr->table_height) << "idx value: " << idx[i] + << " i: " << i; + CHECK_GE(idx[i], 0) << "idx value: " << idx[i] << " i: " << i; }; for (int64_t w = 0; w != attr->index_width; ++w) { @@ -527,12 +526,12 @@ void Sgd(const T* lr, const int64_t* rows, T* out, const lite::jit::sgd_attr_t* attr) { - PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width); - PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height); + CHECK_EQ(attr->param_width, attr->grad_width); + CHECK_LE(attr->selected_rows_size, attr->grad_height); for (int64_t i = 0; i < attr->selected_rows_size; ++i) { auto h_idx = rows[i]; - PADDLE_ENFORCE_LT(h_idx, attr->param_height); - PADDLE_ENFORCE_GE(h_idx, 0); + CHECK_LT(h_idx, attr->param_height); + CHECK_GE(h_idx, 0); for (int64_t j = 0; j < attr->grad_width; ++j) { out[h_idx * attr->grad_width + j] = param[h_idx * attr->grad_width + j] - diff --git a/lite/backends/x86/jit/test.cc b/lite/backends/x86/jit/test.cc index aafcad579fdefd675323e0e2a6f40bd89c2a0166..03570a56d9c766271be630fe1d2e3048c6c42608 100644 --- a/lite/backends/x86/jit/test.cc +++ b/lite/backends/x86/jit/test.cc @@ -910,8 +910,8 @@ void TestKernelSgd() { const T lr = 0.1; auto UnDuplicatedRandomVec = []( int n, const int64_t lower, const int64_t upper) -> std::vector { - PADDLE_ENFORCE_LE(static_cast(upper - lower), n - 1); - PADDLE_ENFORCE_GT(n, 0); + CHECK_LE(static_cast(upper - lower), n - 1); + CHECK_GT(n, 0); std::vector all, out; for (int i = 0; i < n; ++i) { all.push_back(i); diff --git a/lite/backends/x86/math/beam_search.cc b/lite/backends/x86/math/beam_search.cc index 5d7e98629cb89bd7a3fdee852507e0f381e54931..274e8836dd6e59d610ddeb7a63f898cdc1b19cc1 100644 --- a/lite/backends/x86/math/beam_search.cc +++ b/lite/backends/x86/math/beam_search.cc @@ -116,7 +116,7 @@ class BeamSearchFunctor { lod[0].assign(high_level.begin(), high_level.end()); lod[1].assign(low_level.begin(), low_level.end()); // if (!lite::fluid::CheckLoD(lod)) { - // //PADDLE_THROW("lod %s is not right", framework::LoDToString(lod)); + // //LOG(FATAL)<<"lod %s is not right", framework::LoDToString(lod)); //} selected_ids->set_lod(lod); selected_scores->set_lod(lod); diff --git a/lite/backends/x86/math/blas.cc b/lite/backends/x86/math/blas.cc index 3bc5f9f67ad96e7ec699400ff6369fe48c745b7e..4c6bf06951f81e90a73c91c2378f904db5678495 100644 --- a/lite/backends/x86/math/blas.cc +++ b/lite/backends/x86/math/blas.cc @@ -23,7 +23,7 @@ namespace math { MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim, int num_flatten_cols, bool trans) { - PADDLE_ENFORCE_GT(tensor_dim.size(), 1u); + CHECK_GT(tensor_dim.size(), 1u); MatDescriptor retv; if (num_flatten_cols > 1) { auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols); diff --git a/lite/backends/x86/math/blas_impl.h b/lite/backends/x86/math/blas_impl.h index 34b258892be05625ae88076eff175f56a53d3537..4a64e45ea945f2d46c06ba31d67bd2a0fbf7c635 100644 --- a/lite/backends/x86/math/blas_impl.h +++ b/lite/backends/x86/math/blas_impl.h @@ -287,22 +287,22 @@ struct CBlas { template <> struct CBlas { - static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); } + static void GEMM(...) { LOG(FATAL) << "float16 GEMM not supported on CPU"; } static void SMM_GEMM(...) { - PADDLE_THROW("float16 SMM_GEMM not supported on CPU"); + LOG(FATAL) << "float16 SMM_GEMM not supported on CPU"; } - static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } - static void VEXP(...) 
{ PADDLE_THROW("float16 VEXP not supported on CPU"); } + static void VMUL(...) { LOG(FATAL) << "float16 VMUL not supported on CPU"; } + static void VEXP(...) { LOG(FATAL) << "float16 VEXP not supported on CPU"; } static void VSQUARE(...) { - PADDLE_THROW("float16 VSQUARE not supported on CPU"); + LOG(FATAL) << "float16 VSQUARE not supported on CPU"; } - static void VPOW(...) { PADDLE_THROW("float16 VPOW not supported on CPU"); } - static void DOT(...) { PADDLE_THROW("float16 DOT not supported on CPU"); }; - static void SCAL(...) { PADDLE_THROW("float16 SCAL not supported on CPU"); }; - static void ASUM(...) { PADDLE_THROW("float16 ASUM not supported on CPU"); }; + static void VPOW(...) { LOG(FATAL) << "float16 VPOW not supported on CPU"; } + static void DOT(...) { LOG(FATAL) << "float16 DOT not supported on CPU"; }; + static void SCAL(...) { LOG(FATAL) << "float16 SCAL not supported on CPU"; }; + static void ASUM(...) { LOG(FATAL) << "float16 ASUM not supported on CPU"; }; #ifdef PADDLE_WITH_MKLML static void GEMM_BATCH(...) { - PADDLE_THROW("float16 GEMM_BATCH not supported on CPU"); + LOG(FATAL) << "float16 GEMM_BATCH not supported on CPU"; } #endif }; @@ -461,11 +461,11 @@ void Blas::MatMul(const lite::Tensor &mat_a, auto dim_a = mat_a.dims(); auto dim_b = mat_b.dims(); auto dim_out = mat_out->dims(); - PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, - "The input and output of matmul be matrix"); - // PADDLE_ENFORCE( - // mat_a.target() == mat_b.target() && mat_a.target() == mat_out->target(), - // "The targets of matrices must be same"); + CHECK(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2) + << "The input and output of matmul be matrix"; + // CHECK( + // mat_a.target() == mat_b.target() && mat_a.target() == mat_out->target()) + // << "The targets of matrices must be same"; int M = dim_out[0]; int N = dim_out[1]; @@ -746,7 +746,7 @@ void Blas::MatMul(const lite::Tensor &mat_a, T alpha, lite::Tensor *mat_out, T beta) const { - PADDLE_ENFORCE_EQ(dim_a.width_, dim_b.height_); + CHECK_EQ(dim_a.width_, dim_b.height_); CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans; if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) { @@ -761,8 +761,8 @@ void Blas::MatMul(const lite::Tensor &mat_a, beta, mat_out->template mutable_data()); } else { - PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ || - dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0); + CHECK(dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 || + dim_b.batch_size_ == 0); this->template BatchedGEMM( transA, transB, diff --git a/lite/backends/x86/math/context_project.h b/lite/backends/x86/math/context_project.h index 0c56e0d759fd9b1e3abba5209f43d7a0c8fe194e..72a2f4ce12cbd72b26cd87e97d0178275a4b4abd 100644 --- a/lite/backends/x86/math/context_project.h +++ b/lite/backends/x86/math/context_project.h @@ -146,7 +146,7 @@ class ContextProjectFunctor { } } if (padding_trainable) { - PADDLE_ENFORCE(padding_data != nullptr); + CHECK(padding_data != nullptr); for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { if (lod_level_0[i] == lod_level_0[i + 1]) continue; diff --git a/lite/backends/x86/math/cpu_vec.h b/lite/backends/x86/math/cpu_vec.h index 9ff64d53f069d2e4c5b639d273af5b4aa5738b2b..0e721cc8c272eee4b1df1f4b254b5e1d0c1ebb0a 100644 --- a/lite/backends/x86/math/cpu_vec.h +++ b/lite/backends/x86/math/cpu_vec.h @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include #include #include "lite/backends/x86/cpu_info.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" #ifdef PADDLE_WITH_MKLML #include "lite/backends/x86/mklml.h" @@ -652,7 +652,7 @@ class VecActivations { } else if (type == "identity" || type == "") { return vec_identity; } - PADDLE_THROW("Not support type: %s", type); + LOG(FATAL) << "Not support type: " << type; } }; diff --git a/lite/backends/x86/math/cross_entropy.cc b/lite/backends/x86/math/cross_entropy.cc index 941a34643669f060cdd18f38f92c39e529da7b19..2419620111b7ace292d8a2d366fc1dce2353a15c 100644 --- a/lite/backends/x86/math/cross_entropy.cc +++ b/lite/backends/x86/math/cross_entropy.cc @@ -57,7 +57,7 @@ class CrossEntropyFunctor { for (int i = 0; i < batch_size; ++i) { for (int j = 0; j < num_remain; j++) { int lbl = label_data[i * num_remain + j]; - PADDLE_ENFORCE((lbl >= 0 && lbl < axis_dim) || lbl == ignore_index); + CHECK((lbl >= 0 && lbl < axis_dim) || lbl == ignore_index); int index = i * num_classes + lbl * num_remain + j; int loss_idx = i * num_remain + j; loss_data[loss_idx] = diff --git a/lite/backends/x86/math/cross_entropy.h b/lite/backends/x86/math/cross_entropy.h index 6b66f0b08548c1306681409345c051d1ab40a7c0..d2a66083ac1a72de9e5e469618fc387a5ea784dc 100644 --- a/lite/backends/x86/math/cross_entropy.h +++ b/lite/backends/x86/math/cross_entropy.h @@ -27,7 +27,7 @@ namespace math { template struct TolerableValue { HOSTDEVICE T operator()(const T& x) const { - PADDLE_ENFORCE(static_cast(std::is_floating_point::value)); + CHECK(static_cast(std::is_floating_point::value)); const T kApproInf = 1e20; if (x == INFINITY) return kApproInf; diff --git a/lite/backends/x86/math/detail/activation_functions.h b/lite/backends/x86/math/detail/activation_functions.h index 6a13a3d471e10970b36120a12b21a36256350803..dc3c3eac1989f256378e408b8e8e4401bea43e7c 100644 --- a/lite/backends/x86/math/detail/activation_functions.h +++ b/lite/backends/x86/math/detail/activation_functions.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include #include "lite/backends/x86/cpu_info.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -46,8 +46,6 @@ inline ActivationType GetActivationType(const std::string &type) { return ActivationType::kIdentity; } LOG(ERROR) << "Not support type " << type; - // PADDLE_ENFORCE(false, "Not support type %s", type); - // PADDLE_THROW("Not support type %s.", type); return ActivationType(); } diff --git a/lite/backends/x86/math/gru_compute.h b/lite/backends/x86/math/gru_compute.h index 86b7a91f4127de50aeb5c5fb02122bced0af4188..767e9b9da0e2977f566c793c2fdc71f83ab5b6d4 100644 --- a/lite/backends/x86/math/gru_compute.h +++ b/lite/backends/x86/math/gru_compute.h @@ -13,7 +13,7 @@ limitations under the License. */ #include "lite/backends/x86/math/detail/activation_functions.h" #include "lite/core/context.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/math/im2col.cc b/lite/backends/x86/math/im2col.cc index b916c912ffc2a4d62b63b98fdce150b353ba087e..abbd9b0e2811913f6aff79561e365d20bffbeae4 100644 --- a/lite/backends/x86/math/im2col.cc +++ b/lite/backends/x86/math/im2col.cc @@ -15,7 +15,7 @@ limitations under the License. 
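For context on the `CrossEntropyFunctor` hunk above: it operates on already-softmaxed probabilities with hard labels, and the per-position loss being assigned is, in standard form and up to the `TolerableValue` clamp (which maps overflow such as the log of zero to a large finite value),

```latex
\ell_{ij} =
\begin{cases}
0, & y_{ij} = \text{ignore\_index},\\[2pt]
-\log p_{i,\,y_{ij},\,j}, & \text{otherwise,}
\end{cases}
```

where i indexes the batch, j the remaining (e.g. spatial) positions, and y_{ij} is the integer label validated by the `CHECK` just before the assignment.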
*/ #include "lite/backends/x86/math/im2col.h" #include #include "lite/backends/x86/math/im2col_cfo_cpu.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -38,8 +38,8 @@ class Im2ColFunctor& stride, const std::vector& padding, lite::Tensor* col) { - PADDLE_ENFORCE(im.dims().size() == 3); - PADDLE_ENFORCE(col->dims().size() == 5); + CHECK_EQ(im.dims().size(), 3); + CHECK_EQ(col->dims().size(), 5); if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 && dilation[1] == 1) { @@ -72,8 +72,8 @@ class Col2ImFunctor& stride, const std::vector& padding, lite::Tensor* im) { - PADDLE_ENFORCE(im->dims().size() == 3); - PADDLE_ENFORCE(col.dims().size() == 5); + CHECK_EQ(im->dims().size(), 3); + CHECK_EQ(col.dims().size(), 5); int im_channels = im->dims()[0]; int im_height = im->dims()[1]; int im_width = im->dims()[2]; @@ -82,20 +82,20 @@ class Col2ImFunctor& stride, const std::vector& padding, lite::Tensor* col) { - PADDLE_ENFORCE(im.dims().size() == 3); - PADDLE_ENFORCE(col->dims().size() == 5); + CHECK_EQ(im.dims().size(), 3); + CHECK_EQ(col->dims().size(), 5); int im_channels = im.dims()[0]; int im_height = im.dims()[1]; int im_width = im.dims()[2]; @@ -214,8 +214,8 @@ class Col2ImFunctor& stride, const std::vector& padding, lite::Tensor* im) { - PADDLE_ENFORCE(im->dims().size() == 3); - PADDLE_ENFORCE(col.dims().size() == 5); + CHECK_EQ(im->dims().size(), 3); + CHECK_EQ(col.dims().size(), 5); int im_channels = im->dims()[0]; int im_height = im->dims()[1]; int im_width = im->dims()[2]; @@ -224,16 +224,16 @@ class Col2ImFunctortemplate mutable_data(); const T* col_data = col.data(); diff --git a/lite/backends/x86/math/lstm_compute.h b/lite/backends/x86/math/lstm_compute.h index ddb7bea9995ebcca978be97f8295eb07b0e4e17e..b403770cca7248fba10e62708dddfb91f2789488 100644 --- a/lite/backends/x86/math/lstm_compute.h +++ b/lite/backends/x86/math/lstm_compute.h @@ -16,7 +16,7 @@ limitations under the License. */ #include "lite/backends/x86/math/detail/activation_functions.h" #include "lite/core/context.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/math/math_function.cc b/lite/backends/x86/math/math_function.cc index cb1781db2199c1b7a12aaec80b1904f65b23b534..cc4aa5d9fa54c50eb944714c14a5f6b15634a181 100644 --- a/lite/backends/x86/math/math_function.cc +++ b/lite/backends/x86/math/math_function.cc @@ -121,8 +121,8 @@ struct RowwiseAdd { lite::Tensor* output) { const auto& in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector.numel(), size); - PADDLE_ENFORCE_EQ(output->dims(), in_dims); + CHECK_EQ(vector.numel(), size); + CHECK_EQ(output->dims(), in_dims); const T* input_data = input.data(); const T* vector_data = vector.data(); diff --git a/lite/backends/x86/math/math_function.h b/lite/backends/x86/math/math_function.h index 8f629b5f171814f0df8e51e61123c7c0aabf7643..7081ec0053e0b4194730e6f4353e1274d6019bb4 100644 --- a/lite/backends/x86/math/math_function.h +++ b/lite/backends/x86/math/math_function.h @@ -20,8 +20,8 @@ limitations under the License. 
*/ #include "lite/core/op_lite.h" #include "lite/core/tensor.h" #include "lite/fluid/float16.h" -#include "lite/utils/paddle_enforce.h" -//#include "lite/tensor_util.h" +#include "lite/utils/cp_logging.h" +// #include "lite/tensor_util.h" namespace paddle { namespace lite { diff --git a/lite/backends/x86/math/math_function_impl.h b/lite/backends/x86/math/math_function_impl.h index acfb76759f6fc9fa4122afd2388bc3adf8f5ea22..9bbfebcfb2feb0e3c9d68261240bed18888350c3 100644 --- a/lite/backends/x86/math/math_function_impl.h +++ b/lite/backends/x86/math/math_function_impl.h @@ -59,7 +59,7 @@ void ColwiseSum::operator()(const lite::Context& context, lite::TensorLite* out) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(out->numel(), size); + CHECK_EQ(out->numel(), size); auto in = lite::fluid::EigenMatrix::From(input); auto vec = lite::fluid::EigenVector::Flatten(*out); @@ -81,7 +81,7 @@ class ColwiseSum { auto& in_dims = input.dims(); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), size); + CHECK_EQ(out->numel(), size); T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); @@ -103,8 +103,8 @@ void RowwiseMean::operator()(const lite::Context& context, const lite::TensorLite& input, lite::TensorLite* out) { auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + CHECK_EQ(in_dims.size(), 2U); + CHECK_EQ(out->numel(), in_dims[0]); auto in = lite::fluid::EigenMatrix::From(input); auto vec = lite::fluid::EigenVector::Flatten(*out); @@ -124,10 +124,10 @@ class RowwiseMean { const lite::TensorLite& input, lite::TensorLite* out) { auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + CHECK_EQ(in_dims.size(), 2U); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), height); + CHECK_EQ(out->numel(), height); auto inv_size = 1.0 / size; T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); @@ -147,8 +147,8 @@ void RowwiseSum::operator()(const lite::Context& context, const lite::TensorLite& input, lite::TensorLite* out) { auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0]); + CHECK_EQ(in_dims.size(), 2U); + CHECK_EQ(out->numel(), in_dims[0]); auto in = lite::fluid::EigenMatrix::From(input); auto vec = lite::fluid::EigenVector::Flatten(*out); @@ -168,10 +168,10 @@ class RowwiseSum { const lite::TensorLite& input, lite::TensorLite* out) { auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U); + CHECK_EQ(in_dims.size(), 2U); auto height = in_dims[0]; auto size = in_dims[1]; - PADDLE_ENFORCE_EQ(out->numel(), height); + CHECK_EQ(out->numel(), height); T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); diff --git a/lite/backends/x86/math/math_function_test.cc b/lite/backends/x86/math/math_function_test.cc index 19122a6169fbbe1729e38389b0006b11190bc206..b3511ca3521634a771965348e754e10bfd72e19f 100644 --- a/lite/backends/x86/math/math_function_test.cc +++ b/lite/backends/x86/math/math_function_test.cc @@ -273,7 +273,7 @@ TEST(math_funciton, set_constant) { auto* ctx = new paddle::platform::CPUDeviceContext(); paddle::operators::math::set_constant(*ctx, &t, 10); for (int64_t i = 0; i < t.numel(); ++i) { - PADDLE_ENFORCE_EQ(10, t.data()[i]); + CHECK_EQ(10, t.data()[i]); } delete ctx; } diff --git a/lite/backends/x86/math/sampler.h 
b/lite/backends/x86/math/sampler.h index efd9e48e5443186b6b735287cc150f99cb42be81..07cca52e1f436c2979a331dd27c2ddc554c0dad8 100644 --- a/lite/backends/x86/math/sampler.h +++ b/lite/backends/x86/math/sampler.h @@ -32,7 +32,7 @@ namespace math { class Sampler { public: explicit Sampler(int64_t range, unsigned int seed = 0UL) : range_(range) { - // PADDLE_ENFORCE_GT(range, 0, "Range should be greater than 0."); + // CHECK_GT(range, 0, "Range should be greater than 0."); if (seed == 0) { std::random_device r; seed_ = r(); diff --git a/lite/backends/x86/math/selected_rows_functor.cc b/lite/backends/x86/math/selected_rows_functor.cc index 03a18587f4a029bcaebe484ca1ab1951e7c3ecad..8e2a81905b871902aa8ec79c9dd718a62c9f6dec 100644 --- a/lite/backends/x86/math/selected_rows_functor.cc +++ b/lite/backends/x86/math/selected_rows_functor.cc @@ -31,7 +31,7 @@ struct SelectedRowsAdd { const fluid::SelectedRows& input2, fluid::SelectedRows* output) { auto in1_height = input1.height(); - PADDLE_ENFORCE_EQ(in1_height, input2.height()); + CHECK_EQ(in1_height, input2.height()); output->set_height(in1_height); auto& in1_rows = input1.rows(); @@ -49,8 +49,8 @@ struct SelectedRowsAdd { auto& in2_value = input2.value(); auto in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); - PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); + CHECK_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); + CHECK_EQ(in1_row_numel, out_value->numel() / out_rows.size()); auto* out_data = out_value->template mutable_data(); auto* in1_data = in1_value.data(); @@ -73,15 +73,15 @@ struct SelectedRowsAddTensor { auto in1_height = input1.height(); auto in2_dims = input2.dims(); auto out_dims = output->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); - PADDLE_ENFORCE_EQ(in1_height, out_dims[0]); + CHECK_EQ(in1_height, in2_dims[0]); + CHECK_EQ(in1_height, out_dims[0]); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); - PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); + CHECK_EQ(in1_row_numel, input2.numel() / in1_height); + CHECK_EQ(in1_row_numel, output->numel() / in1_height); SetConstant functor; functor(context, output, 0.0); @@ -113,7 +113,7 @@ struct SelectedRowsAddTo { const int64_t input2_offset, fluid::SelectedRows* input2) { auto in1_height = input1.height(); - PADDLE_ENFORCE_EQ(in1_height, input2->height()); + CHECK_EQ(in1_height, input2->height()); auto& in1_rows = input1.rows(); auto& in2_rows = *(input2->mutable_rows()); @@ -149,7 +149,7 @@ struct SelectedRowsSumTo { auto& in_rows = (*iter)->rows(); size += in_rows.end() - in_rows.begin(); auto in1_height = (*iter)->height(); - PADDLE_ENFORCE_EQ(in1_height, input2->height()); + CHECK_EQ(in1_height, input2->height()); } // concat rows std::vector in2_rows; @@ -185,13 +185,13 @@ struct SelectedRowsAddToTensor { auto in1_height = input1.height(); auto in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + CHECK_EQ(in1_height, in2_dims[0]); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + CHECK_EQ(in1_row_numel, input2->numel() / in1_height); auto* in1_data = in1_value.data(); auto* input2_data = input2->template mutable_data(); @@ -291,12 +291,11 @@ struct 
MergeAdd { if (input->rows().size() == 0) { continue; } - PADDLE_ENFORCE_EQ(input_width, - input->value().dims()[1], - "all input should have same " - "dimension except for the first one"); - PADDLE_ENFORCE_EQ( - input_height, input->height(), "all input should have same height"); + CHECK_EQ(input_width, input->value().dims()[1]) + << "all input should have same " + "dimension except for the first one"; + CHECK_EQ(input_height, input->height()) + << "all input should have same height"; row_num += input->rows().size(); merged_row_set.insert(input->rows().begin(), input->rows().end()); } @@ -376,13 +375,13 @@ struct UpdateToTensor { lite::Tensor* input2) { auto in1_height = input1.height(); auto in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]); + CHECK_EQ(in1_height, in2_dims[0]); auto& in1_value = input1.value(); auto& in1_rows = input1.rows(); int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); + CHECK_EQ(in1_row_numel, input2->numel() / in1_height); auto* in1_data = in1_value.data(); auto* input2_data = input2->template data(); diff --git a/lite/backends/x86/math/sequence2batch.cc b/lite/backends/x86/math/sequence2batch.cc index aa7aeac532e2fa1f90d452924b364be1896ee862..597521b6e7cac49ac91dbddac71af22bb5a8760c 100644 --- a/lite/backends/x86/math/sequence2batch.cc +++ b/lite/backends/x86/math/sequence2batch.cc @@ -30,12 +30,10 @@ class CopyMatrixRowsFunctor { const uint64_t* index = index_lod.data(); const auto& src_dims = src.dims(); const auto& dst_dims = dst->dims(); - PADDLE_ENFORCE_EQ( - src_dims.size(), 2UL, "The src must be matrix with rank 2."); - PADDLE_ENFORCE_EQ( - dst_dims.size(), 2UL, "The dst must be matrix with rank 2."); - PADDLE_ENFORCE_EQ( - src_dims[1], dst_dims[1], "The width of src and dst must be same."); + CHECK_EQ(src_dims.size(), 2UL) << "The src must be matrix with rank 2."; + CHECK_EQ(dst_dims.size(), 2UL) << "The dst must be matrix with rank 2."; + CHECK_EQ(src_dims[1], dst_dims[1]) + << "The width of src and dst must be same."; auto height = dst_dims[0]; auto width = dst_dims[1]; auto* src_data = src.data(); diff --git a/lite/backends/x86/math/sequence2batch.h b/lite/backends/x86/math/sequence2batch.h index 796894cb7d18ec4db7b670276bb3d3fc5b1427f8..953576eea4170cca57f10bb977ca9bfecb36ae6d 100644 --- a/lite/backends/x86/math/sequence2batch.h +++ b/lite/backends/x86/math/sequence2batch.h @@ -19,7 +19,7 @@ limitations under the License. 
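The SelectedRowsAdd* checks above all assert the same invariant: a SelectedRows pairs a dense value tensor of shape [rows.size(), row_numel] with a row-index vector and a logical height, so row_numel can be recovered as value.numel() / rows.size(). A tiny numeric sketch of that invariant (all values are made up):

    #include <cstdint>
    #include <vector>
    #include "lite/utils/cp_logging.h"

    void SelectedRowsInvariantSketch() {
      int64_t height = 8;                     // logical number of rows
      std::vector<int64_t> rows = {1, 4, 6};  // rows actually materialized
      int64_t value_numel = 3 * 5;            // value tensor has shape [3, 5]
      int64_t row_numel = value_numel / static_cast<int64_t>(rows.size());
      // SelectedRowsAdd requires both inputs to agree on height and row_numel,
      // which is exactly what the CHECK_EQs above express.
      CHECK_EQ(row_numel, 5);
      CHECK_EQ(height, 8);
    }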
*/ #include "lite/core/context.h" #include "lite/core/tensor.h" #include "lite/fluid/eigen.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -66,21 +66,18 @@ class LoDTensor2BatchFunctor { bool is_reverse = false) const { if (!is_cal_batch_lod) { auto lods = batch->lod(); - PADDLE_ENFORCE_GT(lods.size(), - 2UL, - "The LoD of LoDTensor should inlcude at least 2-level " - "sequence information."); - PADDLE_ENFORCE_EQ( - lods[1].size(), - static_cast(lod_tensor.dims()[0]), - "The LoD information should be consistent with the dims."); + CHECK_GT(lods.size(), 2UL) + << "The LoD of LoDTensor should inlcude at least 2-level " + "sequence information."; + CHECK_EQ(lods[1].size(), static_cast(lod_tensor.dims()[0])) + << "The LoD information should be consistent with the dims."; CopyMatrixRowsFunctor to_batch; to_batch(context, lod_tensor, lods[1], batch, true); return; } auto lods = lod_tensor.lod(); - PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now."); + CHECK_EQ(lods.size(), 1UL) << "Only support one level sequence now."; const auto& lod = lods[0]; @@ -165,14 +162,11 @@ class Batch2LoDTensorFunctor { const lite::Tensor& batch, lite::Tensor* lod_tensor) const { auto in_lod = batch.lod(); - PADDLE_ENFORCE_GT(in_lod.size(), - 2UL, - "The LoD of LoDTensor should inlcude at least 2-level " - "sequence information."); - PADDLE_ENFORCE_EQ( - in_lod[1].size(), - static_cast(lod_tensor->dims()[0]), - "The LoD information should be consistent with the dims."); + CHECK_GT(in_lod.size(), 2UL) + << "The LoD of LoDTensor should inlcude at least 2-level " + "sequence information."; + CHECK_EQ(in_lod[1].size(), static_cast(lod_tensor->dims()[0])) + << "The LoD information should be consistent with the dims."; CopyMatrixRowsFunctor to_seq; to_seq(context, batch, in_lod[1], lod_tensor, false); } diff --git a/lite/backends/x86/math/sequence_padding.cc b/lite/backends/x86/math/sequence_padding.cc index eb977dc2d23f4cfaeec7dd5a6e2834ca23345f76..3b2f8bfc4f58a4bfcab968a9288eb8d1d817d78d 100644 --- a/lite/backends/x86/math/sequence_padding.cc +++ b/lite/backends/x86/math/sequence_padding.cc @@ -37,10 +37,9 @@ void CopyValidData(lite::Tensor* dst_tensor, layout == kBatchLengthWidth ? step_width : seq_num * step_width; for (int seq_idx = 0; seq_idx < seq_num; ++seq_idx) { int valid_seq_len = seq_offsets[seq_idx + 1] - seq_offsets[seq_idx]; - PADDLE_ENFORCE_GE( - pad_seq_len, - valid_seq_len, - "The padded sequence length can not be less than its original length."); + CHECK_GE(pad_seq_len, valid_seq_len) << "The padded sequence length can " + "not be less than its original " + "length."; int seq_data_offset = seq_offsets[seq_idx] * step_width; int pad_data_offset = layout == kBatchLengthWidth ? 
seq_idx * pad_seq_len * step_width @@ -108,9 +107,9 @@ class PaddingLoDTensorFunctor { pad_seq_len, step_width, layout); - PADDLE_ENFORCE(pad_value.numel() == 1 || pad_value.numel() == step_width, - "The numel of 'pad_value' can only be 1 or be equal to the " - "'step_width'."); + CHECK(pad_value.numel() == 1 || pad_value.numel() == step_width) + << "The numel of 'pad_value' can only be 1 or be equal to the " + "'step_width'."; // fill padding value T* pad_data = pad_tensor->template mutable_data(); diff --git a/lite/backends/x86/math/sequence_padding.h b/lite/backends/x86/math/sequence_padding.h index 43407014dea0ed0c78ab29da7fb8ebb0e0310566..5512c4aa11fb5dc05283d01b1d6d3da7fb83c064 100644 --- a/lite/backends/x86/math/sequence_padding.h +++ b/lite/backends/x86/math/sequence_padding.h @@ -19,7 +19,7 @@ limitations under the License. */ #include "lite/core/context.h" #include "lite/core/tensor.h" #include "lite/fluid/lod.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -46,15 +46,14 @@ inline static void CheckDims(const lite::DDim& seq_tensor_dims, int64_t padded_seq_len, int64_t step_width, const PadLayout& layout) { - PADDLE_ENFORCE_EQ(static_cast(seq_tensor_dims[0]), - seq_offset.back(), - "Value of 1st dimension of the sequence tensor should be " - "equal to sum of lengths of all sequences."); + CHECK_EQ(static_cast(seq_tensor_dims[0]), seq_offset.back()) + << "Value of 1st dimension of the sequence tensor should be " + "equal to sum of lengths of all sequences."; - PADDLE_ENFORCE(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() || - seq_tensor_dims.size() == pad_tensor_dims.size(), - "pad_tensor's rank should be 1 greater than seq_tensor's " - "rank, or be equal with it."); + CHECK(seq_tensor_dims.size() + 1 == pad_tensor_dims.size() || + seq_tensor_dims.size() == pad_tensor_dims.size()) + << "pad_tensor's rank should be 1 greater than seq_tensor's " + "rank, or be equal with it."; } /* diff --git a/lite/backends/x86/math/sequence_pooling.cc b/lite/backends/x86/math/sequence_pooling.cc index 2d00ebad61840da5b14fbf12d9255394b2b2df1a..c1ddb030349a7f7f46fd6b98d3f967eb6fdfe48e 100644 --- a/lite/backends/x86/math/sequence_pooling.cc +++ b/lite/backends/x86/math/sequence_pooling.cc @@ -46,12 +46,12 @@ class MaxSeqPoolFunctor { auto in_dims = input.dims(); auto out_dims = output->dims(); auto idx_dims = index->dims(); - PADDLE_ENFORCE_GT(in_dims.size(), 1u); - PADDLE_ENFORCE_GT(out_dims.size(), 1u); + CHECK_GT(in_dims.size(), 1u); + CHECK_GT(out_dims.size(), 1u); for (size_t i = 1; i < in_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); + CHECK_EQ(in_dims[i], out_dims[i]); } - PADDLE_ENFORCE_EQ(idx_dims, out_dims); + CHECK_EQ(idx_dims, out_dims); auto starts = input.lod()[0]; const T* in_data = input.data(); @@ -95,10 +95,10 @@ class MaxSeqPoolFunctor { lite::Tensor* index) { auto in_dims = input.dims(); auto out_dims = output->dims(); - PADDLE_ENFORCE_GT(in_dims.size(), 1u); - PADDLE_ENFORCE_GT(out_dims.size(), 1u); + CHECK_GT(in_dims.size(), 1u); + CHECK_GT(out_dims.size(), 1u); for (size_t i = 1; i < in_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); + CHECK_EQ(in_dims[i], out_dims[i]); } auto starts = input.lod()[0]; @@ -136,12 +136,12 @@ class MaxSeqPoolGradFunctor { auto og_dims = out_grad.dims(); auto ig_dims = in_grad->dims(); auto idx_dims = index.dims(); - PADDLE_ENFORCE_GT(og_dims.size(), 1); - PADDLE_ENFORCE_GT(ig_dims.size(), 1); + CHECK_GT(og_dims.size(), 1); + 
CHECK_GT(ig_dims.size(), 1); for (size_t i = 1; i < og_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); + CHECK_EQ(og_dims[i], ig_dims[i]); } - PADDLE_ENFORCE_EQ(idx_dims, og_dims); + CHECK_EQ(idx_dims, og_dims); const T* og_data = out_grad.data(); const int* max_index = index.data(); @@ -236,7 +236,7 @@ class SumSeqPoolGradFunctor { auto lod = in_grad->lod()[0]; int64_t out_w = out_grad.numel() / out_grad.dims()[0]; int64_t in_w = in_grad->numel() / in_grad->dims()[0]; - PADDLE_ENFORCE(in_w == out_w); + CHECK(in_w == out_w); const T* out_g_data = out_grad.data(); T* in_g_data = in_grad->template mutable_data(TARGET(kX86)); auto blas = math::GetBlas(context); @@ -330,7 +330,7 @@ class SequencePoolFunctor { out_e.device(eigen_device) = in_e.sum(Eigen::array({{0}})) / std::sqrt(static_cast(h)); } else { - PADDLE_THROW("unsupported pooling pooltype"); + LOG(FATAL) << "unsupported pooling pooltype"; } } } @@ -389,7 +389,7 @@ class SequencePoolGradFunctor { } else if (pooltype == "FIRST") { in_g_e.chip(0, 0).device(eigen_device) = out_g_e_v; } else { - PADDLE_THROW("unsupported pooling pooltype"); + LOG(FATAL) << "unsupported pooling pooltype"; } } } diff --git a/lite/backends/x86/math/sequence_pooling_test.cc b/lite/backends/x86/math/sequence_pooling_test.cc index b91f43a571994bef95650361a6dc62c0465837a7..8bba0f92055dbee5a81bf12ab2fa5cc6592bd60c 100644 --- a/lite/backends/x86/math/sequence_pooling_test.cc +++ b/lite/backends/x86/math/sequence_pooling_test.cc @@ -50,9 +50,9 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { in_grad.mutable_data(in_dims, context->GetPlace()); // check tensor contruction result - PADDLE_ENFORCE_EQ(in_grad.dims().size(), out_grad.dims().size()); + CHECK_EQ(in_grad.dims().size(), out_grad.dims().size()); for (int64_t i = 1; i < out_grad.dims().size(); ++i) { - PADDLE_ENFORCE_EQ(in_grad.dims()[i], out_grad.dims()[i]); + CHECK_EQ(in_grad.dims()[i], out_grad.dims()[i]); } // call functor diff --git a/lite/backends/x86/math/tree2col.cc b/lite/backends/x86/math/tree2col.cc index c54bb2099edd0a7e6be61cfdff6340734f09116a..bcab1e77c0bef356453bf1ea1f30aabfc9f1dff0 100644 --- a/lite/backends/x86/math/tree2col.cc +++ b/lite/backends/x86/math/tree2col.cc @@ -55,7 +55,7 @@ void Tree2ColUtil::construct_tree(const lite::Tensor &EdgeSet, std::vector> *tr, size_t *node_count) { auto edge_set_dims = EdgeSet.dims(); - PADDLE_ENFORCE_EQ(edge_set_dims[1], 2); + CHECK_EQ(edge_set_dims[1], 2); int64_t edge_count = EdgeSet.numel(); const int *edge_data = EdgeSet.data(); diff --git a/lite/backends/x86/math/unpooling.cc b/lite/backends/x86/math/unpooling.cc index 119d7294e9ec21e67f09776ad20d04f15b8b81ce..7ff132cbf121172b5bf35966637080d599eaf498 100644 --- a/lite/backends/x86/math/unpooling.cc +++ b/lite/backends/x86/math/unpooling.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
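Several of the checks above (the LoD checks in sequence2batch and CheckDims in sequence_padding) rest on the level-0 LoD convention: offsets start at 0, are cumulative, and the last entry equals the first dimension of the stacked sequence tensor. A small illustration with made-up lengths:

    #include <cstdint>
    #include <vector>
    #include "lite/utils/cp_logging.h"

    void LodOffsetsSketch() {
      // Three sequences of lengths 2, 3 and 1 give offsets {0, 2, 5, 6}.
      std::vector<uint64_t> seq_offset = {0, 2, 5, 6};
      uint64_t seq_tensor_rows = 6;  // first dimension of the sequence tensor
      CHECK_EQ(seq_tensor_rows, seq_offset.back())
          << "Value of 1st dimension of the sequence tensor should be "
             "equal to sum of lengths of all sequences.";
    }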
*/ #include "lite/backends/x86/math/unpooling.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -41,7 +41,7 @@ class Unpool2dMaxFunctor { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { int index = indices_data[i]; - PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + CHECK(index < output_feasize) << "err index in unpooling!"; output_data[index] = input_data[i]; } input_data += input_feasize; @@ -77,7 +77,7 @@ class Unpool2dMaxGradFunctor { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { int index = indices_data[i]; - PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + CHECK(index < output_feasize) << "err index in unpooling!"; input_grad_data[i] = output_grad_data[index]; } input_grad_data += input_feasize; diff --git a/lite/backends/x86/math/vol2col.cc b/lite/backends/x86/math/vol2col.cc index 91979bb7fdcfe66d84ded3f9797144ddafc8769e..8e8f44be55fc2df342092ad399f00bcc7941908d 100644 --- a/lite/backends/x86/math/vol2col.cc +++ b/lite/backends/x86/math/vol2col.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "lite/backends/x86/math/vol2col.h" #include -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -36,8 +36,8 @@ class Vol2ColFunctor { const std::vector& strides, const std::vector& paddings, lite::Tensor* col) const { - PADDLE_ENFORCE(vol.dims().size() == 4); - PADDLE_ENFORCE(col->dims().size() == 7); + CHECK_EQ(vol.dims().size(), 4); + CHECK_EQ(col->dims().size(), 7); int input_channels = vol.dims()[0]; int input_depth = vol.dims()[1]; @@ -52,27 +52,27 @@ class Vol2ColFunctor { int channels_col = input_channels * filter_depth * filter_height * filter_width; - PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1, - output_depth, - "input_depth and output_depth are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1, - output_height, - "input_height and output_height are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1, - output_width, - "input_width and output_width are " - "mismatching."); + CHECK_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth) + << "input_depth and output_depth are " + "mismatching."; + CHECK_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height) + << "input_height and output_height are " + "mismatching."; + CHECK_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width) + << "input_width and output_width are " + "mismatching."; const T* vol_data = vol.data(); T* col_data = col->template mutable_data(); @@ -122,8 +122,8 @@ class Col2VolFunctor { const std::vector& strides, const std::vector& paddings, lite::Tensor* vol) const { - PADDLE_ENFORCE(vol->dims().size() == 4); - PADDLE_ENFORCE(col.dims().size() == 7); + CHECK_EQ(vol->dims().size(), 4); + CHECK_EQ(col.dims().size(), 7); int input_channels = vol->dims()[0]; int input_depth = vol->dims()[1]; @@ -138,27 +138,27 @@ class Col2VolFunctor { int channels_col = input_channels * filter_depth * filter_height * filter_width; - 
PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1, - output_depth, - "input_depth and output_depth are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1, - output_height, - "input_height and output_height are " - "mismatching."); - PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1, - output_width, - "input_width and output_width are " - "mismatching."); + CHECK_EQ((input_depth + 2 * paddings[0] - + ((dilations[0] * (filter_depth - 1) + 1))) / + strides[0] + + 1, + output_depth) + << "input_depth and output_depth are " + "mismatching."; + CHECK_EQ((input_height + 2 * paddings[1] - + ((dilations[1] * (filter_height - 1) + 1))) / + strides[1] + + 1, + output_height) + << "input_height and output_height are " + "mismatching."; + CHECK_EQ((input_width + 2 * paddings[2] - + ((dilations[2] * (filter_width - 1) + 1))) / + strides[2] + + 1, + output_width) + << "input_width and output_width are " + "mismatching."; T* vol_data = vol->template mutable_data(); const T* col_data = col.data(); diff --git a/lite/backends/xpu/debug.h b/lite/backends/xpu/debug.h new file mode 100644 index 0000000000000000000000000000000000000000..75d18b6f4bf461a871c26c7665d8b48bc2f3db38 --- /dev/null +++ b/lite/backends/xpu/debug.h @@ -0,0 +1,131 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
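The Vol2ColFunctor/Col2VolFunctor checks above assert the usual convolution output-size relation along each spatial axis. Written out once, with illustrative numbers (the helper name is not part of the patch):

    // out = (in + 2 * pad - (dilation * (kernel - 1) + 1)) / stride + 1
    inline int ConvOutputSize(int in, int kernel, int pad, int stride, int dilation) {
      return (in + 2 * pad - (dilation * (kernel - 1) + 1)) / stride + 1;
    }
    // e.g. in = 5, kernel = 3, pad = 1, stride = 1, dilation = 1
    //   -> (5 + 2 - 3) / 1 + 1 = 5, so output_depth must equal 5.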
+ +#pragma once + +#include +#include +#include +#include +#include +#include "lite/backends/xpu/xpu_header_sitter.h" + +namespace paddle { +namespace lite { +namespace xpu { + +template +void DumpCPUMem(const T* ptr, + size_t len, + const std::string& comment = "", + size_t stride = 1, + size_t item_per_line = 30) { + size_t after_stride_len = (len + stride - 1) / stride; + std::unique_ptr after_stride(new T[after_stride_len]); + for (size_t i = 0; i < after_stride_len; ++i) { + after_stride[i] = ptr[i * stride]; + } + double sum = 0; + for (size_t i = 0; i < len; ++i) { + sum += ptr[i]; + } + + printf( + "------------------------------ [%s] len=%zd stride=%zd sum=%f BEGIN " + "------------------------------\n", + comment.c_str(), + len, + stride, + sum); + size_t nline = (after_stride_len + item_per_line - 1) / item_per_line; + for (size_t i = 0; i < nline; ++i) { + size_t line_begin = i * item_per_line; + size_t line_end = line_begin + item_per_line; + printf("line[%04zd] -- ", i); + for (size_t ii = line_begin; (ii < line_end) && (ii < after_stride_len); + ++ii) { + if (std::is_same::value) { + printf("%.6f, ", static_cast(after_stride[ii])); + } else if (std::is_same::value) { + printf("%d ", static_cast(after_stride[ii])); + } else { + // CHECK(false) << "unknown type"; + } + } + printf("\n"); + } + printf( + "------------------------------ [%s] len=%zd stride=%zd sum=%f END " + "------------------------------\n", + comment.c_str(), + len, + stride, + sum); +} + +template +void DumpXPUMem(const T* ptr, + size_t len, + const std::string& comment = "", + size_t stride = 1, + size_t item_per_line = 30) { + size_t after_stride_len = (len + stride - 1) / stride; + std::unique_ptr cpu_mem(new T[len]); + xpu_memcpy( + cpu_mem.get(), ptr, len * sizeof(T), XPUMemcpyKind::XPU_DEVICE_TO_HOST); + std::unique_ptr after_stride(new T[after_stride_len]); + for (size_t i = 0; i < after_stride_len; ++i) { + after_stride[i] = cpu_mem[i * stride]; + } + double sum = 0; + for (size_t i = 0; i < len; ++i) { + sum += cpu_mem[i]; + } + + printf( + "------------------------------ [%s] len=%zd stride=%zd sum=%f BEGIN " + "------------------------------\n", + comment.c_str(), + len, + stride, + sum); + size_t nline = (after_stride_len + item_per_line - 1) / item_per_line; + for (size_t i = 0; i < nline; ++i) { + size_t line_begin = i * item_per_line; + size_t line_end = line_begin + item_per_line; + printf("line[%04zd] -- ", i); + for (size_t ii = line_begin; (ii < line_end) && (ii < after_stride_len); + ++ii) { + if (std::is_same::value) { + printf("%.6f, ", static_cast(after_stride[ii])); + } else if (std::is_same::value) { + printf("%d ", static_cast(after_stride[ii])); + } else { + // CHECK(false) << "unknown type"; + } + } + printf("\n"); + } + printf( + "------------------------------ [%s] len=%zd stride=%zd sum=%f END " + "------------------------------\n", + comment.c_str(), + len, + stride, + sum); +} + +} // namespace xpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/target_wrapper.cc b/lite/backends/xpu/target_wrapper.cc index 5dcbc1e275cca8c32003cbef74dfb1f6d4caee93..85a0023590858ab72e9e4f258d62dce809888918 100644 --- a/lite/backends/xpu/target_wrapper.cc +++ b/lite/backends/xpu/target_wrapper.cc @@ -13,7 +13,6 @@ // limitations under the License. 
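The new lite/backends/xpu/debug.h added above provides two printf-based dump helpers for inspecting buffers while debugging XPU kernels; DumpXPUMem first copies the device buffer back to the host with xpu_memcpy. A minimal usage sketch (the buffer names and stride value are illustrative):

    #include "lite/backends/xpu/debug.h"

    void InspectConvOutput(const float* host_ptr, const float* xpu_ptr, size_t len) {
      // Print every 4th element of a host-side buffer, 30 items per line by default.
      paddle::lite::xpu::DumpCPUMem(host_ptr, len, "conv_out (host)", /*stride=*/4);
      // Copies the device buffer back to the host internally, then prints it.
      paddle::lite::xpu::DumpXPUMem(xpu_ptr, len, "conv_out (xpu)");
    }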
#include "lite/backends/xpu/target_wrapper.h" -#include "lite/backends/xpu/xpu_header_sitter.h" namespace paddle { namespace lite { @@ -42,5 +41,21 @@ void TargetWrapperXPU::MemcpySync(void* dst, } } +XPUScratchPadGuard TargetWrapperXPU::MallocScratchPad(size_t size, + bool use_l3) { + void* ptr{nullptr}; + if (use_l3) { + ptr = xdnn::alloc_workspace(GetRawContext(), size); + } else { + ptr = TargetWrapperXPU::Malloc(size); + } + CHECK(ptr != nullptr); + return XPUScratchPadGuard(new XPUScratchPad(ptr, use_l3)); +} + +std::string TargetWrapperXPU::multi_encoder_precision; // NOLINT +int TargetWrapperXPU::workspace_l3_size_per_thread{0}; +thread_local xdnn::Context* TargetWrapperXPU::tls_raw_ctx_{nullptr}; + } // namespace lite } // namespace paddle diff --git a/lite/backends/xpu/target_wrapper.h b/lite/backends/xpu/target_wrapper.h index c42d4139246085d8b9a367b45b60699209d0b668..b84b5d75e74a14e81091b003aa3ae5514e53a42c 100644 --- a/lite/backends/xpu/target_wrapper.h +++ b/lite/backends/xpu/target_wrapper.h @@ -14,6 +14,8 @@ #pragma once +#include // std::unique_ptr +#include "lite/backends/xpu/xpu_header_sitter.h" // xpu_free #include "lite/core/target_wrapper.h" namespace paddle { @@ -21,6 +23,24 @@ namespace lite { using TargetWrapperXPU = TargetWrapper; +struct XPUScratchPad { + XPUScratchPad(void* addr, bool is_l3) : addr_(addr), is_l3_(is_l3) {} + + void* addr_{nullptr}; + bool is_l3_{false}; +}; + +struct XPUScratchPadDeleter { + void operator()(XPUScratchPad* sp) const { + if (!sp->is_l3_) { + xpu_free(sp->addr_); + } + delete sp; + } +}; + +using XPUScratchPadGuard = std::unique_ptr; + template <> class TargetWrapper { public: @@ -34,6 +54,41 @@ class TargetWrapper { const void* src, size_t size, IoDirection dir); + + static XPUScratchPadGuard MallocScratchPad(size_t size, bool use_l3 = true); + + static xdnn::Context* GetRawContext() { + if (tls_raw_ctx_ == nullptr) { + tls_raw_ctx_ = xdnn::create_context(); + CHECK(tls_raw_ctx_); + int r = xdnn::set_workspace_l3_size(tls_raw_ctx_, + workspace_l3_size_per_thread); + if (r != 0) { + LOG(WARNING) << "xdnn::set_workspace_l3_size() failed, r = " << r + << ", workspace_l3_size_per_thread = " + << workspace_l3_size_per_thread; + } + } + return tls_raw_ctx_; + } + + // **DEPRECATED**, use xpu_set_device() at the very beginning of each worker + // thread + static void SetDev(int dev_no = 0) { + const char* dev_env = getenv("LITE_XPU_DEV"); + if (dev_env) { + xpu_set_device(atoi(dev_env)); + return; + } + + xpu_set_device(dev_no); + } + + static std::string multi_encoder_precision; // NOLINT + static int workspace_l3_size_per_thread; + + private: + static thread_local xdnn::Context* tls_raw_ctx_; }; } // namespace lite diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index 56a5c9b8f7ea0ed47d21629d7ccf083b4f9fa232..af2bfbe86aaa1b3f145838015a6d6a62090cb3b1 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -121,7 +121,7 @@ lite_cc_library(kernel SRCS kernel.cc PROFILE_DEPS lite_profiler ) lite_cc_library(op SRCS op_lite.cc DEPS scope op_registry target_wrapper kernel - cpp_op_desc tensor + cpp_op_desc tensor utils ) add_dependencies(kernel kernel_list_h) diff --git a/lite/core/arena/framework.cc b/lite/core/arena/framework.cc index 731215f542567ec3ff0cc87d6990624bfa6b2bc2..1138a3bcc2e3e3f3c77d94bf8128b8231f930550 100644 --- a/lite/core/arena/framework.cc +++ b/lite/core/arena/framework.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "lite/core/arena/framework.h" +#include #include "lite/core/context.h" #include "lite/operators/subgraph_op.h" @@ -22,7 +23,14 @@ namespace arena { void TestCase::CreateInstruction() { std::shared_ptr op = nullptr; - if (place_.target == TARGET(kNPU) || place_.target == TARGET(kXPU)) { + static const std::set subgraph_op_supported_targets( + {TARGET(kNPU), TARGET(kXPU)}); + bool enable_subgraph_op = subgraph_op_supported_targets.find(place_.target) != + subgraph_op_supported_targets.end(); +#if defined(LITE_WITH_XPU) && !defined(LITE_WITH_XTCL) + enable_subgraph_op = false; // Use XPU kernel directly if XTCL is disabled. +#endif + if (enable_subgraph_op) { // Create a new block desc to wrap the original op desc int sub_block_idx = 0; auto sub_block_desc = new cpp::BlockDesc(); @@ -47,7 +55,7 @@ void TestCase::CreateInstruction() { op = LiteOpRegistry::Global().Create(op_desc().Type()); } CHECK(op) << "no op for " << op_desc().Type(); - op->Attach(*op_desc_, inst_scope_); + op->Attach(*op_desc_, inst_scope_.get()); auto kernels = op->CreateKernels({place_}); // filter out the target kernel CHECK(!kernels.empty()) << "No kernel found for place " @@ -72,53 +80,35 @@ void TestCase::CreateInstruction() { void TestCase::PrepareInputsForInstruction() { for (auto& arg : op_desc().InputArgumentNames()) { for (auto& var : op_desc().Input(arg)) { - std::string kernel_key = instruction_->kernel()->key_with_alias(); - const auto* param_type = ParamTypeRegistry::Global().RetrieveInArgument( - place_, kernel_key, arg); - - const Type* inst_type = nullptr; - if (param_type->type->IsTensor()) { - inst_type = Type::GetTensorTy(TARGET(kHost)); - } else if (param_type->type->IsTensorList()) { - inst_type = Type::GetTensorListTy(TARGET(kHost)); - } else { - LOG(FATAL) << "unsupported param_type"; - } - - CHECK(scope_->FindVar(var)); - if (!TargetCompatibleTo(*inst_type, *param_type->type)) { - /// Create a tensor or tensor_array in the instruction's scope, - /// alloc memory and then copy data there. - if (param_type->type->IsTensor()) { - const auto* shared_tensor = scope_->FindTensor(var); - auto* target_tensor = inst_scope_->NewTensor(var); - CHECK(!shared_tensor->dims().empty()) << "shared_tensor is empty yet"; - target_tensor->Resize(shared_tensor->dims()); - TargetCopy(param_type->type->target(), - target_tensor->mutable_data(param_type->type->target(), - shared_tensor->memory_size()), - shared_tensor->raw_data(), - shared_tensor->memory_size()); - } else if (param_type->type->IsTensorList()) { - const auto* shared_tensor_array = - scope_->FindVar(var)->GetMutable>(); - auto* target_tensor_array = - inst_scope_->Var(var)->GetMutable>(); - CHECK(!shared_tensor_array->empty()) - << "shared_tensor_array is empty yet"; - target_tensor_array->resize(shared_tensor_array->size()); - for (size_t i = 0; i < shared_tensor_array->size(); i++) { - target_tensor_array->at(i).Resize( - shared_tensor_array->at(i).dims()); - TargetCopy(param_type->type->target(), - target_tensor_array->at(i).mutable_data( - param_type->type->target(), - shared_tensor_array->at(i).memory_size()), - shared_tensor_array->at(i).raw_data(), - shared_tensor_array->at(i).memory_size()); - } - } else { - LOG(FATAL) << "not support"; + const auto* type = instruction_->kernel()->GetInputDeclType(arg); + CHECK(base_scope_->FindVar(var)); + /// Create a tensor or tensor_array in the instruction's scope, + /// alloc memory and then copy data there. 
+ if (type->IsTensor() && + !TargetCompatibleTo(*Type::GetTensorTy(TARGET(kHost)), *type)) { + const auto* base_tensor = base_scope_->FindTensor(var); + auto* inst_tensor = inst_scope_->FindMutableTensor(var); + CHECK(!base_tensor->dims().empty()) + << "The dims of input tensor is empty yet"; + TargetCopy(type->target(), + inst_tensor->mutable_data(type->target(), + base_tensor->memory_size()), + base_tensor->raw_data(), + base_tensor->memory_size()); + } else if (type->IsTensorList() && + !TargetCompatibleTo(*Type::GetTensorListTy(TARGET(kHost)), + *type)) { + const auto* base_tensor_list = base_scope_->FindTensorList(var); + auto* inst_tensor_list = inst_scope_->FindMutableTensorList(var); + CHECK_EQ(base_tensor_list->size(), inst_tensor_list->size()); + for (size_t i = 0; i < base_tensor_list->size(); i++) { + CHECK(!base_tensor_list->at(i).dims().empty()) + << "The dims of input tensor[" << i << "] is empty yet"; + TargetCopy(type->target(), + inst_tensor_list->at(i).mutable_data( + type->target(), base_tensor_list->at(i).memory_size()), + inst_tensor_list->at(i).raw_data(), + inst_tensor_list->at(i).memory_size()); } } } @@ -126,78 +116,88 @@ void TestCase::PrepareInputsForInstruction() { } template -bool TestCase::CheckTensorPrecision(const Tensor* a_tensor, - const Tensor* b_tensor, +bool TestCase::CheckTensorPrecision(const Tensor* inst_tensor, + const Tensor* base_tensor, float abs_error) { - CHECK(a_tensor); - CHECK(b_tensor); + CHECK(inst_tensor); + CHECK(base_tensor); - CHECK(ShapeEquals(a_tensor->dims(), b_tensor->dims())); + CHECK(ShapeEquals(inst_tensor->dims(), base_tensor->dims())); - CHECK(a_tensor->lod() == b_tensor->lod()) << "lod not match"; + CHECK(inst_tensor->lod() == base_tensor->lod()) << "lod not match"; // The baseline should output in host devices. - CHECK(b_tensor->target() == TARGET(kHost) || - b_tensor->target() == TARGET(kX86) || - b_tensor->target() == TARGET(kARM)); - - const T* a_data{}; - switch (a_tensor->target()) { + CHECK(base_tensor->target() == TARGET(kHost) || + base_tensor->target() == TARGET(kX86) || + base_tensor->target() == TARGET(kARM)); + const T* inst_data{}; + Tensor inst_host_tensor; + inst_host_tensor.Resize(inst_tensor->dims()); + switch (inst_tensor->target()) { case TARGET(kX86): case TARGET(kHost): case TARGET(kARM): - a_data = static_cast(a_tensor->raw_data()); + inst_data = static_cast(inst_tensor->raw_data()); + break; +#ifdef LITE_WITH_XPU + case TARGET(kXPU): + CopySync(inst_host_tensor.mutable_data(), + inst_tensor->raw_data(), + sizeof(T) * inst_tensor->dims().production(), + IoDirection::DtoH); + inst_data = inst_host_tensor.data(); break; +#endif default: // Before compare, need to copy data from `target` device to host. 
LOG(FATAL) << "Not supported"; } - CHECK(a_data); + CHECK(inst_data); - const T* b_data = static_cast(b_tensor->raw_data()); + const T* base_data = static_cast(base_tensor->raw_data()); bool success = true; - for (int i = 0; i < a_tensor->dims().production(); i++) { - EXPECT_NEAR(a_data[i], b_data[i], abs_error); - if (fabsf(a_data[i] - b_data[i]) > abs_error) { + for (int i = 0; i < inst_tensor->dims().production(); i++) { + EXPECT_NEAR(inst_data[i], base_data[i], abs_error); + if (fabsf(inst_data[i] - base_data[i]) > abs_error) { success = false; } } return success; } -bool TestCase::CheckPrecision(const Tensor* a_tensor, - const Tensor* b_tensor, +bool TestCase::CheckPrecision(const Tensor* inst_tensor, + const Tensor* base_tensor, float abs_error, PrecisionType precision_type) { PrecisionType precision_type_t = precision_type; if (precision_type == PRECISION(kAny)) { - precision_type_t = b_tensor->precision(); + precision_type_t = base_tensor->precision(); } - CHECK(precision_type_t == b_tensor->precision()) + CHECK(precision_type_t == base_tensor->precision()) << "arg precision type and base tensor precision type are not matched! " "arg precision type is: " << PrecisionToStr(precision_type) << ", base tensor precision type is: " - << PrecisionToStr(b_tensor->precision()); - CHECK(a_tensor->precision() == b_tensor->precision()) + << PrecisionToStr(base_tensor->precision()); + CHECK(inst_tensor->precision() == base_tensor->precision()) << "real tensor precision type and base tensor precision type are not " "matched! real tensor precision type is: " - << PrecisionToStr(a_tensor->precision()) + << PrecisionToStr(inst_tensor->precision()) << ", base tensor precision type is: " - << PrecisionToStr(b_tensor->precision()); + << PrecisionToStr(base_tensor->precision()); switch (precision_type_t) { case PRECISION(kFloat): - return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + return CheckTensorPrecision(inst_tensor, base_tensor, abs_error); case PRECISION(kInt8): - return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + return CheckTensorPrecision(inst_tensor, base_tensor, abs_error); case PRECISION(kInt32): - return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + return CheckTensorPrecision(inst_tensor, base_tensor, abs_error); case PRECISION(kInt64): - return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + return CheckTensorPrecision(inst_tensor, base_tensor, abs_error); case PRECISION(kBool): - return CheckTensorPrecision(a_tensor, b_tensor, abs_error); + return CheckTensorPrecision(inst_tensor, base_tensor, abs_error); default: LOG(FATAL) << "not support type: " << PrecisionToStr(precision_type); return false; @@ -209,24 +209,24 @@ bool TestCase::CheckPrecision(const std::string& var_name, PrecisionType precision_type) { bool success = true; if (inst_scope_->FindVar(var_name)->IsType()) { - auto a_tensor = inst_scope_->FindTensor(var_name); - auto b_tensor = base_scope_->FindTensor(var_name); - success = success && - CheckPrecision(a_tensor, b_tensor, abs_error, precision_type); + auto inst_tensor = inst_scope_->FindTensor(var_name); + auto base_tensor = base_scope_->FindTensor(var_name); + success = + success && + CheckPrecision(inst_tensor, base_tensor, abs_error, precision_type); } else if (inst_scope_->FindVar(var_name)->IsType>()) { - auto a_tensor_array = - inst_scope_->FindVar(var_name)->GetMutable>(); - auto b_tensor_array = - base_scope_->FindVar(var_name)->GetMutable>(); - CHECK_EQ(a_tensor_array->size(), b_tensor_array->size()); - for (size_t i = 0; i 
< a_tensor_array->size(); i++) { - Tensor* a_tensor = &(a_tensor_array->at(i)); - Tensor* b_tensor = &(b_tensor_array->at(i)); - if (a_tensor->dims().size() == 0 && b_tensor->dims().size() == 0) { + auto inst_tensor_list = inst_scope_->FindMutableTensorList(var_name); + auto base_tensor_list = base_scope_->FindMutableTensorList(var_name); + CHECK_EQ(inst_tensor_list->size(), base_tensor_list->size()); + for (size_t i = 0; i < inst_tensor_list->size(); i++) { + Tensor* inst_tensor = &(inst_tensor_list->at(i)); + Tensor* base_tensor = &(base_tensor_list->at(i)); + if (inst_tensor->dims().size() == 0 && base_tensor->dims().size() == 0) { continue; } - success = success && - CheckPrecision(a_tensor, b_tensor, abs_error, precision_type); + success = + success && + CheckPrecision(inst_tensor, base_tensor, abs_error, precision_type); } } else { LOG(FATAL) << "unsupported var type"; diff --git a/lite/core/arena/framework.h b/lite/core/arena/framework.h index cf864a32044e3dfd03ecd03327a0db69275ef586..4e73768e53576f03e47158618fa4f0eac0851382 100644 --- a/lite/core/arena/framework.h +++ b/lite/core/arena/framework.h @@ -28,7 +28,7 @@ #include "lite/core/program.h" #include "lite/core/scope.h" #include "lite/core/types.h" -#include "lite/model_parser/cpp/op_desc.h" +#include "lite/model_parser/cpp_desc.h" namespace paddle { namespace lite { @@ -40,13 +40,15 @@ namespace arena { class TestCase { public: explicit TestCase(const Place& place, const std::string& alias) - : place_(place), scope_(new Scope), alias_(alias) { + : place_(place), + alias_(alias), + inst_scope_(new Scope), + base_scope_(new Scope) { ctx_ = ContextScheduler::Global().NewContext(place_.target); } virtual ~TestCase(); void Prepare() { - PrepareScopes(); PrepareData(); op_desc_.reset(new cpp::OpDesc); PrepareOpDesc(op_desc_.get()); @@ -91,16 +93,15 @@ class TestCase { // kernel registry. void CheckKernelConsistWithDefinition() {} - Scope& scope() { return *scope_; } - - Scope* baseline_scope() { return base_scope_; } - Scope* inst_scope() { return inst_scope_; } + Scope* baseline_scope() { return base_scope_.get(); } + Scope* inst_scope() { return inst_scope_.get(); } protected: // Prepare inputs in scope() for Tester. virtual void PrepareData() = 0; - /// Prepare a tensor in host. The tensors will be created in scope_. + /// Prepare a tensor in host. The tensors will be created both in base_scope_ + /// and inst_scope_. /// Need to specify the targets other than X86 or ARM. template void SetCommonTensor(const std::string& var_name, @@ -108,42 +109,47 @@ class TestCase { const T* data, const LoD& lod = {}, bool is_persistable = false) { - auto* tensor = scope_->NewTensor(var_name); - tensor->Resize(ddim); - auto* d = tensor->mutable_data(); - memcpy(d, data, ddim.production() * sizeof(T)); + // Create and fill a input tensor with the given data for baseline + auto* base_tensor = base_scope_->NewTensor(var_name); + base_tensor->Resize(ddim); + memcpy(base_tensor->mutable_data(), data, ddim.production() * sizeof(T)); // set lod - if (!lod.empty()) *tensor->mutable_lod() = lod; + if (!lod.empty()) *base_tensor->mutable_lod() = lod; // set persistable - tensor->set_persistable(is_persistable); + base_tensor->set_persistable(is_persistable); + + // Create a copy for instruction + auto* inst_tensor = inst_scope_->NewTensor(var_name); + inst_tensor->CopyDataFrom(*base_tensor); } /// Prepare a tensor_array in host. The tensors will be created in scope_. /// Need to specify the targets other than X86 or ARM. 
template void SetCommonTensorList(const std::string& var_name, - const std::vector& array_tensor_dims, + const std::vector& ddims, const std::vector>& datas, const std::vector& lods = {}) { - CHECK_EQ(array_tensor_dims.size(), datas.size()); + // Create a tensor array for baseline, and a copy for instruction + CHECK_EQ(ddims.size(), datas.size()); if (!lods.empty()) { - CHECK_EQ(array_tensor_dims.size(), lods.size()); + CHECK_EQ(ddims.size(), lods.size()); } - auto* tensor_array = - scope_->Var(var_name)->GetMutable>(); - for (int i = 0; i < array_tensor_dims.size(); i++) { - Tensor tmp; - tmp.Resize(array_tensor_dims[i]); - auto* tmp_data = tmp.mutable_data(); - memcpy(tmp_data, + auto* base_tensor_list = base_scope_->NewTensorList(var_name); + auto* inst_tensor_list = inst_scope_->NewTensorList(var_name); + for (int i = 0; i < ddims.size(); i++) { + Tensor item; + item.Resize(ddims[i]); + memcpy(item.mutable_data(), datas[i].data(), - array_tensor_dims[i].production() * sizeof(T)); + ddims[i].production() * sizeof(T)); if (!lods.empty()) { - tmp.set_lod(lods[i]); + item.set_lod(lods[i]); } - tensor_array->push_back(tmp); + base_tensor_list->push_back(item); + inst_tensor_list->push_back(item); } } @@ -157,11 +163,6 @@ class TestCase { std::unique_ptr ctx_; void CreateInstruction(); - void PrepareScopes() { - inst_scope_ = &scope_->NewScope(); - base_scope_ = &scope_->NewScope(); - } - // Check shape // TODO(Superjomn) Move this method to utils or DDim? bool ShapeEquals(const DDim& a, const DDim& b) { @@ -172,25 +173,23 @@ class TestCase { return true; } - /// Copy the input tensors to target devices needed by the instruction. + // Copy the host tensors to the device tensors if needed by the instruction. void PrepareInputsForInstruction(); // Create output tensors and variables. void PrepareOutputsForInstruction() { for (auto x : op_desc().output_vars()) { - inst_scope_->NewTensor(x); - base_scope_->NewTensor(x); + inst_scope_->Var(x); } } private: Place place_; - std::shared_ptr scope_; std::string alias_; // The workspace for the Instruction. - Scope* inst_scope_{}; + std::shared_ptr inst_scope_; // The workspace for the baseline implementation. 
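With the arena::TestCase refactor above, the single shared scope is replaced by two independent scopes, and SetCommonTensor()/SetCommonTensorList() now fill the baseline scope and copy the data into the instruction scope. A rough sketch of how a derived test case feeds an input under the new scheme; the tensor name, shape and values are invented, and the remaining pure-virtual TestCase members are elided:

    #include <vector>
    #include "lite/core/arena/framework.h"

    class HypotheticalTestCase : public paddle::lite::arena::TestCase {
     public:
      using TestCase::TestCase;

     protected:
      void PrepareData() override {
        std::vector<float> x(4, 1.f);
        // Creates "x" in base_scope_ and copies the same tensor into inst_scope_,
        // so the baseline and the instruction no longer share one scope.
        SetCommonTensor("x", paddle::lite::DDim(std::vector<int64_t>{2, 2}), x.data());
      }
      // PrepareOpDesc(), RunBaseline() and the rest of the TestCase interface
      // are omitted from this sketch.
    };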
- Scope* base_scope_{}; + std::shared_ptr base_scope_; std::unique_ptr op_desc_; std::unique_ptr instruction_; }; diff --git a/lite/core/context.cc b/lite/core/context.cc index eb8f90d7fa90d459846b24bc93b5d26cdfc3969a..f14d1dfddea806ab3839f6f897b9d4d3fe396ca8 100644 --- a/lite/core/context.cc +++ b/lite/core/context.cc @@ -21,10 +21,10 @@ namespace lite { std::string Context::subgraph_model_cache_dir_{""}; // NOLINT #endif -#ifdef LITE_WITH_XPU -std::string Context::_multi_encoder_precision; // NOLINT -thread_local xdnn::Context* Context::_tls_raw_ctx{nullptr}; -int Context::_workspace_l3_size_per_thread{0}; +#ifdef LITE_WITH_MLU +int Context::next_queue_id_{0}; +std::map Context::queue_id_map_; +std::mutex Context::map_mutex_; #endif } // namespace lite diff --git a/lite/core/context.h b/lite/core/context.h index f606eeffaf8ccf932e2d17f03478d4d893ee482d..c3993d9589eeac442eaa827152fd1293852396db 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -25,6 +25,7 @@ #ifdef LITE_WITH_MLU #include #include +#include // NOLINT #include "lite/backends/mlu/mlu_utils.h" #endif #ifdef LITE_WITH_XPU @@ -143,45 +144,12 @@ class Context { void CopySharedTo(XPUContext* ctx) {} + // TODO(miaotianxiang): remove this static xdnn::Context* GetRawContext() { - if (_tls_raw_ctx == nullptr) { - _tls_raw_ctx = xdnn::create_context(); - CHECK(_tls_raw_ctx); - int r = xdnn::set_workspace_l3_size(_tls_raw_ctx, - _workspace_l3_size_per_thread); - if (r != 0) { - LOG(WARNING) << "xdnn::set_workspace_l3_size() failed, r = " << r - << ", _workspace_l3_size_per_thread = " - << _workspace_l3_size_per_thread; - } - } - return _tls_raw_ctx; - } - - static void SetWorkspaceL3Size(int l3_size = 0xfffc00) { - _workspace_l3_size_per_thread = l3_size; - } - - // **DEPRECATED**, use xpu_set_device() at the very beginning of each worker - // thread - static void SetDev(int dev_no = 0) { - const char* dev_env = getenv("LITE_XPU_DEV"); - if (dev_env) { - xpu_set_device(atoi(dev_env)); - return; - } - - xpu_set_device(dev_no); + return TargetWrapperXPU::GetRawContext(); } std::string name() const { return "XPUContext"; } - - public: - static std::string _multi_encoder_precision; // NOLINT - - private: - static thread_local xdnn::Context* _tls_raw_ctx; - static int _workspace_l3_size_per_thread; }; #endif @@ -249,11 +217,11 @@ class Context { void InitOnce() {} MLUContext& operator=(const MLUContext& ctx) { - this->Init(ctx.device_id_, ctx.exec_queue_id_, ctx.io_queue_id_); + this->Init(ctx.device_id_, ctx.exec_queue_id_); return *this; } - void Init(int dev_id, int exec_queue_id = 0, int io_queue_id = 0) { + void Init(int dev_id, int exec_queue_id = 0) { CHECK_GT(devs.size(), 0UL) << "Env is not initialized or current target is not exit!"; if (dev_id >= static_cast(devs.size())) { @@ -264,21 +232,19 @@ class Context { device_id_ = dev_id; } SetMluDevice(device_id_); - if (io_queue_id >= devs[dev_id].max_queue()) { - LOG(WARNING) << "data queue index exceeds the maximum queue number, " - "set to default qeueu(0)!"; - io_queue_id = 0; - } - if (exec_queue_id >= devs[dev_id].max_queue()) { - LOG(WARNING) << "exec queue index exceeds the maximum queue number, " - "set to default qeueu(0)!"; - exec_queue_id = 0; + + // get queue id from map + std::unique_lock lk(map_mutex_); + if (queue_id_map_.find(exec_queue_id) == queue_id_map_.end()) { + queue_id_map_[exec_queue_id] = + next_queue_id_++ % devs[dev_id].max_queue(); } - io_queue_ = devs[dev_id].io_queues()[io_queue_id]; - exec_queue_ = 
devs[dev_id].exec_queues()[exec_queue_id]; + exec_queue_id_ = queue_id_map_[exec_queue_id]; + VLOG(4) << "pick mlu queue id: " << exec_queue_id_; + lk.unlock(); - exec_queue_id_ = exec_queue_id; - io_queue_id_ = io_queue_id; + io_queue_ = devs[dev_id].io_queues()[exec_queue_id_]; + exec_queue_ = devs[dev_id].exec_queues()[exec_queue_id_]; } void CopySharedTo(MLUContext* ctx) { ctx->forward_param_ = forward_param_; } @@ -290,10 +256,12 @@ class Context { void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; } cnmlCoreVersion_t MLUCoreVersion() { - return DeviceInfo::Global().MLUCoreVersion(); + return paddle::lite::TargetWrapperMlu::MLUCoreVersion(); } - int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); } + int MLUCoreNumber() { + return paddle::lite::TargetWrapperMlu::MLUCoreNumber(); + } u32_t affinity() { return affinity_; } @@ -304,10 +272,12 @@ class Context { std::string name() const { return "MLUContext"; } private: + static int next_queue_id_; + static std::map queue_id_map_; + static std::mutex map_mutex_; int device_id_; // overall information int exec_queue_id_; - int io_queue_id_; cnrtQueue_t io_queue_; cnrtQueue_t exec_queue_; @@ -455,7 +425,7 @@ class ContextScheduler { case TARGET(kMLU): { int dev_id = TargetWrapper::GetCurDevice(); auto& context = ctx->As(); - context.Init(dev_id); + context.Init(dev_id, exec_stream_id); kernel_contexts_[TargetType::kMLU].As().CopySharedTo( &context); LOG(INFO) << "New Context for MLU"; diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc index ac79ede37406188f495690179b4a4886bc009d80..6d404cee9718a94d2646728c8f2d79576ceb7860 100644 --- a/lite/core/device_info.cc +++ b/lite/core/device_info.cc @@ -66,15 +66,6 @@ thread_local std::vector DeviceInfo::active_ids_; thread_local TensorLite DeviceInfo::workspace_; thread_local int64_t DeviceInfo::count_ = 0; -#ifdef LITE_WITH_MLU -thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270}; -thread_local int DeviceInfo::mlu_core_number_{1}; -thread_local bool DeviceInfo::use_first_conv_{false}; -thread_local std::vector DeviceInfo::mean_vec_; -thread_local std::vector DeviceInfo::std_vec_; -thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)}; -#endif - #ifdef TARGET_IOS const int DEFAULT_L1_CACHE_SIZE = 64 * 1024; const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024; @@ -1089,45 +1080,6 @@ int DeviceInfo::Setup() { return 0; } -#ifdef LITE_WITH_MLU -void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version, - int core_number, - bool use_first_conv, - const std::vector& mean_vec, - const std::vector& std_vec, - DataLayoutType input_layout) { - switch (core_version) { - case (lite_api::MLUCoreVersion::MLU_220): - mlu_core_version_ = CNML_MLU220; - break; - case (lite_api::MLUCoreVersion::MLU_270): - mlu_core_version_ = CNML_MLU270; - break; - default: - mlu_core_version_ = CNML_MLU270; - break; - } - mlu_core_number_ = core_number; - use_first_conv_ = use_first_conv; - mean_vec_ = mean_vec; - std_vec_ = std_vec; - input_layout_ = input_layout; -} - -cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; } - -int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; } - -bool DeviceInfo::UseFirstConv() { return use_first_conv_; } - -const std::vector& DeviceInfo::MeanVec() const { return mean_vec_; } - -const std::vector& DeviceInfo::StdVec() const { return std_vec_; } - -DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; } - -#endif // LITE_WITH_MLU - void 
DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) { #ifdef ARM_WITH_OMP thread_num = std::min(thread_num, core_num_); diff --git a/lite/core/device_info.h b/lite/core/device_info.h index 603e3e6b91dc6035ffce2265a27bed4d59db5a9c..7aa3131d8fb1a5f8d573c483bafcb7f4d5c62ec7 100644 --- a/lite/core/device_info.h +++ b/lite/core/device_info.h @@ -59,20 +59,6 @@ class DeviceInfo { int Setup(); void SetRunMode(lite_api::PowerMode mode, int thread_num); -#ifdef LITE_WITH_MLU - void SetMLURunMode(lite_api::MLUCoreVersion core_version, - int core_number, - bool use_first_conv, - const std::vector& mean_vec, - const std::vector& std_vec, - DataLayoutType input_layout); - cnmlCoreVersion_t MLUCoreVersion(); - int MLUCoreNumber(); - bool UseFirstConv(); - const std::vector& MeanVec() const; - const std::vector& StdVec() const; - DataLayoutType InputLayout() const; -#endif void SetCache(int l1size, int l2size, int l3size); void SetArch(ARMArch arch) { arch_ = arch; } @@ -124,15 +110,6 @@ class DeviceInfo { static thread_local TensorLite workspace_; static thread_local int64_t count_; -#ifdef LITE_WITH_MLU - static thread_local cnmlCoreVersion_t mlu_core_version_; - static thread_local int mlu_core_number_; - static thread_local bool use_first_conv_; - static thread_local std::vector mean_vec_; - static thread_local std::vector std_vec_; - static thread_local DataLayoutType input_layout_; -#endif - void SetDotInfo(int argc, ...); void SetFP16Info(int argc, ...); void SetFP32Info(int argc, ...); diff --git a/lite/core/kernel.h b/lite/core/kernel.h index 9fffcc60012060327612345528c705bcf7722f17..361d014acc512dc2a46061f86efa83e1e1845807 100644 --- a/lite/core/kernel.h +++ b/lite/core/kernel.h @@ -66,7 +66,7 @@ class KernelBase { virtual void SetProfileRuntimeKernelInfo( paddle::lite::profile::OpCharacter* ch) { ch->kernel_func_name = std::string("NotImpl"); -#ifdef LITE_WITH_ARM +#ifdef LITE_WITH_OPENCL ch->cl_event = event_; #endif } diff --git a/lite/core/memory.cc b/lite/core/memory.cc index 1f2f7fed7d61b67a76f54a092b6d48951bc9fcbd..83e41d2c0960d87a0201b55b943529a9df4f6ab2 100644 --- a/lite/core/memory.cc +++ b/lite/core/memory.cc @@ -140,6 +140,11 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) { dst, src, size, IoDirection::HtoD); break; #endif +#ifdef LITE_WITH_XPU + case TargetType::kXPU: + TargetWrapperXPU::MemcpySync(dst, src, size, IoDirection::HtoD); + break; +#endif #ifdef LITE_WITH_OPENCL case TargetType::kOpenCL: TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD); diff --git a/lite/core/memory.h b/lite/core/memory.h index a1013910019251271ddfccfbc700297c45226fe6..c80c8fb6b6e1356ebfa52920a8ee39f61ed20692 100644 --- a/lite/core/memory.h +++ b/lite/core/memory.h @@ -97,6 +97,11 @@ void CopySync(void* dst, const void* src, size_t size, IoDirection dir) { case TARGET(kBM): TargetWrapper::MemcpySync(dst, src, size, dir); break; +#endif +#ifdef LITE_WITH_XPU + case TARGET(kXPU): + TargetWrapperXPU::MemcpySync(dst, src, size, dir); + break; #endif default: LOG(FATAL) diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt index 2540bb56d4082570c984e8eea009b5575825fec9..be09ed4b1a63154b8561f4d39cff7d987a9fcba7 100644 --- a/lite/core/mir/CMakeLists.txt +++ b/lite/core/mir/CMakeLists.txt @@ -23,9 +23,11 @@ lite_cc_library(mir_passes fusion/sequence_pool_concat_fuse_pass.cc fusion/scale_activation_fuse_pass.cc fusion/__xpu__resnet_fuse_pass.cc + fusion/__xpu__resnet_cbam_fuse_pass.cc fusion/__xpu__multi_encoder_fuse_pass.cc 
        fusion/__xpu__embedding_with_eltwise_add_fuse_pass.cc
        fusion/__xpu__fc_fuse_pass.cc
+       fusion/__xpu__mmdnn_fuse_pass.cc
        elimination/identity_scale_eliminate_pass.cc
        elimination/identity_dropout_eliminate_pass.cc
        elimination/elementwise_mul_constant_eliminate_pass.cc
diff --git a/lite/core/mir/elimination/remove_tf_redundant_ops_pass.cc b/lite/core/mir/elimination/remove_tf_redundant_ops_pass.cc
index f4226820d0437db8cad0cfdac92be15359bb90bd..673854b118a8adaca73cb905eda4892b6903665c 100644
--- a/lite/core/mir/elimination/remove_tf_redundant_ops_pass.cc
+++ b/lite/core/mir/elimination/remove_tf_redundant_ops_pass.cc
@@ -18,7 +18,7 @@
 #include "lite/core/mir/pass.h"
 #include "lite/core/mir/pass_registry.h"
 #include "lite/core/mir/pattern_matcher.h"
-#include "lite/model_parser/cpp/var_desc.h"
+#include "lite/model_parser/cpp_desc.h"
 
 namespace paddle {
 namespace lite {
diff --git a/lite/core/mir/fusion/__xpu__mmdnn_fuse_pass.cc b/lite/core/mir/fusion/__xpu__mmdnn_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..61aeb2ab1f51ddcd6b153971253f8239472a1031
--- /dev/null
+++ b/lite/core/mir/fusion/__xpu__mmdnn_fuse_pass.cc
@@ -0,0 +1,1183 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
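The fusers defined in this new file (and the resnet_cbam pass added later in this patch) repeatedly convert FP32 weight tensors to int16 using paddle::lite::xpu::math::FindMaxAbs and ConvertFP32ToInt16, then record the per-tensor maximum as an attribute such as "__xpu__w_max" so the fused XPU kernels can dequantize at run time. A minimal, self-contained sketch of what that conversion amounts to, assuming a plain symmetric max-abs mapping (the real helpers in lite/backends/xpu/math.h may differ in rounding or saturation details):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Illustrative stand-ins for xpu::math::FindMaxAbs / ConvertFP32ToInt16.
static float FindMaxAbsSketch(const float* data, int len) {
  float max_abs = 0.f;
  for (int i = 0; i < len; ++i) {
    max_abs = std::max(max_abs, std::fabs(data[i]));
  }
  return max_abs;
}

static void ConvertFP32ToInt16Sketch(const float* src, int16_t* dst,
                                     float max_abs, int len) {
  // Map [-max_abs, max_abs] onto the int16 range; a kernel can later recover
  // an approximate float value as dst[i] * max_abs / 32767.
  const float scale = (max_abs > 0.f) ? 32767.f / max_abs : 0.f;
  for (int i = 0; i < len; ++i) {
    dst[i] = static_cast<int16_t>(std::round(src[i] * scale));
  }
}

int main() {
  std::vector<float> w = {0.5f, -1.25f, 2.0f};
  std::vector<int16_t> w_i16(w.size());
  float max_f = FindMaxAbsSketch(w.data(), static_cast<int>(w.size()));
  ConvertFP32ToInt16Sketch(w.data(), w_i16.data(), max_f,
                           static_cast<int>(w.size()));
  // max_f (2.0 here) is what the pass would store as "__xpu__w_max".
  return (w_i16[2] == 32767) ? 0 : 1;
}
```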
+ +#include +#include +#include "lite/backends/xpu/math.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class XPUMmdnnFloat2Fix { + public: + void operator()(SSAGraph* graph) { + for (auto* node : graph->StmtTopologicalOrder()) { + CHECK(node->IsStmt()); + auto* op_info = node->stmt()->op_info(); + std::string op_type = op_info->Type(); + + static const std::vector target_ops{"var_conv_2d", + "search_fc"}; + if (std::find(target_ops.begin(), target_ops.end(), op_type) != + target_ops.end()) { + std::string weight_name = op_info->Input("W").front(); + auto* scope = node->stmt()->op()->scope(); + auto* weight_t = scope->FindMutableTensor(weight_name); + auto weight_dims = weight_t->dims(); + auto weight_len = weight_t->numel(); + float* weight_on_host = weight_t->mutable_data(); + float max_f = + paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len); + std::unique_ptr weight_int16(new int16_t[weight_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + weight_on_host, weight_int16.get(), max_f, weight_len); + memcpy( + weight_on_host, weight_int16.get(), weight_len * sizeof(int16_t)); + + auto update_op_info = *op_info; + update_op_info.SetAttr("__xpu__float_to_fix", true); + update_op_info.SetAttr("__xpu__w_max", max_f); + node->stmt()->ResetOp(update_op_info, graph->valid_places()); + VLOG(3) << "Float2Fix, op_type=" << op_type + << ", weight_name=" << weight_name; + } else if (op_type == "match_matrix_tensor") { + std::string weight_name = op_info->Input("W").front(); + auto* scope = node->stmt()->op()->scope(); + auto* weight_t = scope->FindMutableTensor(weight_name); + auto weight_dims = weight_t->dims(); + auto weight_len = weight_t->numel(); + float* weight_on_host = weight_t->mutable_data(); + float max_f = + paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len); + std::unique_ptr weight_int16(new int16_t[weight_len]); + std::unique_ptr weight_trans_int16(new int16_t[weight_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + weight_on_host, weight_int16.get(), max_f, weight_len); + paddle::lite::xpu::math::Transpose(weight_int16.get(), + weight_trans_int16.get(), + weight_dims[0], + weight_dims[1] * weight_dims[2]); + memcpy(weight_on_host, + weight_trans_int16.get(), + weight_len * sizeof(int16_t)); + + auto update_op_info = *op_info; + update_op_info.SetAttr("__xpu__float_to_fix", true); + update_op_info.SetAttr("__xpu__w_max", max_f); + node->stmt()->ResetOp(update_op_info, graph->valid_places()); + VLOG(3) << "Float2Fix && Transposed, op_type=" << op_type + << ", weight_name=" << weight_name; + } else if (op_type == "search_grnn") { + auto* scope = node->stmt()->op()->scope(); + + std::string wi_name = op_info->Input("Wi").front(); + auto* wi_t = scope->FindMutableTensor(wi_name); + auto wi_dims = wi_t->dims(); + auto wi_len = wi_t->numel(); + auto wi_stride_len = wi_len / 3; + float* wi_on_host = wi_t->mutable_data(); + std::unique_ptr wi_int16(new int16_t[wi_len]); + std::vector wi_max(3); + for (int i = 0; i < 3; ++i) { + float max_f = paddle::lite::xpu::math::FindMaxAbs( + wi_on_host + i * wi_stride_len, wi_stride_len); + paddle::lite::xpu::math::ConvertFP32ToInt16( + wi_on_host + i * wi_stride_len, + wi_int16.get() + i * wi_stride_len, + max_f, + wi_stride_len); + wi_max[i] = max_f; + } + memcpy(wi_on_host, wi_int16.get(), wi_len * sizeof(int16_t)); + + std::string wh_name = op_info->Input("Wh").front(); + auto* wh_t 
= scope->FindMutableTensor(wh_name); + auto wh_dims = wh_t->dims(); + auto wh_len = wh_t->numel(); + auto wh_stride_len = wh_len / 3; + float* wh_on_host = wh_t->mutable_data(); + std::unique_ptr wh_int16(new int16_t[wh_len]); + std::vector wh_max(3); + for (int i = 0; i < 3; ++i) { + float max_f = paddle::lite::xpu::math::FindMaxAbs( + wh_on_host + i * wh_stride_len, wh_stride_len); + paddle::lite::xpu::math::ConvertFP32ToInt16( + wh_on_host + i * wh_stride_len, + wh_int16.get() + i * wh_stride_len, + max_f, + wh_stride_len); + wh_max[i] = max_f; + } + memcpy(wh_on_host, wh_int16.get(), wh_len * sizeof(int16_t)); + + auto update_op_info = *op_info; + update_op_info.SetAttr("__xpu__float_to_fix", true); + update_op_info.SetAttr>("__xpu__wi_max", wi_max); + update_op_info.SetAttr>("__xpu__wh_max", wh_max); + node->stmt()->ResetOp(update_op_info, graph->valid_places()); + VLOG(3) << "Float2Fix, op_type=" << op_type << ", wi_name=" << wi_name + << ", wh_name=" << wh_name; + } + } + } +}; + +class XPUMmdnnSearchAttentionFuser : public FuseBase { + public: + void BuildPattern() override { + auto* input = VarNode("input")->AsInput(); + + auto* search_group_padding = + OpNode("search_group_padding", "search_group_padding"); + auto* out_emb_padding = + VarNode("out_emb_padding") + ->assert_is_op_output("search_group_padding", "Out_emb_padding") + ->AsIntermediate(); + auto* out_new = VarNode("out_new") + ->assert_is_op_output("search_group_padding", "Out_new") + ->AsIntermediate(); + auto* out_padding = + VarNode("out_padding") + ->assert_is_op_output("search_group_padding", "Out_padding") + ->AsIntermediate(); + + auto* search_seq_fc_w = VarNode("search_seq_fc_w") + ->assert_is_op_input("search_seq_fc", "W") + ->AsInput(); + auto* search_seq_fc_b = VarNode("search_seq_fc_b") + ->assert_is_op_input("search_seq_fc", "b") + ->AsInput(); + auto* search_seq_fc = + OpNode("search_seq_fc", "search_seq_fc")->AsIntermediate(); + auto* search_seq_fc_out = VarNode("search_seq_fc_out") + ->assert_is_op_output("search_seq_fc", "Out") + ->AsIntermediate(); + + auto* search_aligned_mat_mul = + OpNode("search_aligned_mat_mul", "search_aligned_mat_mul") + ->AsIntermediate(); + auto* search_aligned_mat_mul_out = + VarNode("search_aligned_mat_mul_out") + ->assert_is_op_output("search_aligned_mat_mul", "Out") + ->AsIntermediate(); + auto* search_aligned_mat_mul_a = + VarNode("search_aligned_mat_mul_a") + ->assert_is_op_output("search_aligned_mat_mul", "_a_addr") + ->AsIntermediate(); + auto* search_aligned_mat_mul_b = + VarNode("search_aligned_mat_mul_b") + ->assert_is_op_output("search_aligned_mat_mul", "_b_addr") + ->AsIntermediate(); + auto* search_aligned_mat_mul_c = + VarNode("search_aligned_mat_mul_c") + ->assert_is_op_output("search_aligned_mat_mul", "_c_addr") + ->AsIntermediate(); + + auto* search_attention_padding_mask = + OpNode("search_attention_padding_mask", "search_attention_padding_mask") + ->AsIntermediate(); + auto* search_attention_padding_mask_out = + VarNode("search_attention_padding_mask_out") + ->assert_is_op_output("search_attention_padding_mask", "Out") + ->AsIntermediate(); + auto* search_attention_padding_mask_pad_begin = + VarNode("search_attention_padding_mask_pad_begin") + ->assert_is_op_output("search_attention_padding_mask", "pad_begin") + ->AsIntermediate(); + + auto* search_seq_softmax = + OpNode("search_seq_softmax", "search_seq_softmax")->AsIntermediate(); + auto* search_seq_softmax_out = + VarNode("search_seq_softmax_out") + ->assert_is_op_output("search_seq_softmax", "Out") + 
->AsIntermediate(); + auto* search_seq_softmax_out_log = + VarNode("search_seq_softmax_out_log") + ->assert_is_op_output("search_seq_softmax", "Out_log") + ->AsIntermediate(); + + auto* search_aligned_mat_mul_2 = + OpNode("search_aligned_mat_mul_2", "search_aligned_mat_mul") + ->AsIntermediate(); + auto* search_aligned_mat_mul_2_out = + VarNode("search_aligned_mat_mul_2_out") + ->assert_is_op_output("search_aligned_mat_mul", "Out") + ->AsIntermediate(); + auto* search_aligned_mat_mul_2_a = + VarNode("search_aligned_mat_mul_2_a") + ->assert_is_op_output("search_aligned_mat_mul", "_a_addr") + ->AsIntermediate(); + auto* search_aligned_mat_mul_2_b = + VarNode("search_aligned_mat_mul_2_b") + ->assert_is_op_output("search_aligned_mat_mul", "_b_addr") + ->AsIntermediate(); + auto* search_aligned_mat_mul_2_c = + VarNode("search_aligned_mat_mul_2_c") + ->assert_is_op_output("search_aligned_mat_mul", "_c_addr") + ->AsIntermediate(); + + auto* search_seq_depadding = + OpNode("search_seq_depadding")->AsIntermediate(); + auto* search_seq_depadding_out = + VarNode("search_seq_depadding_out")->AsOutput(); + + *input >> *search_group_padding >> *out_emb_padding; + *search_group_padding >> *out_new; + *search_group_padding >> *out_padding; + + *search_seq_fc_w >> *search_seq_fc; + *search_seq_fc_b >> *search_seq_fc; + *out_emb_padding >> *search_seq_fc; + *search_seq_fc >> *search_seq_fc_out; + + *search_seq_fc_out >> *search_aligned_mat_mul; + *out_emb_padding >> *search_aligned_mat_mul; + *search_aligned_mat_mul >> *search_aligned_mat_mul_out; + *search_aligned_mat_mul >> *search_aligned_mat_mul_a; + *search_aligned_mat_mul >> *search_aligned_mat_mul_b; + *search_aligned_mat_mul >> *search_aligned_mat_mul_c; + + *search_aligned_mat_mul_out >> *search_attention_padding_mask; + *out_padding >> *search_attention_padding_mask; + *search_attention_padding_mask >> *search_attention_padding_mask_out; + *search_attention_padding_mask >> *search_attention_padding_mask_pad_begin; + + *search_attention_padding_mask_out >> *search_seq_softmax; + *search_seq_softmax >> *search_seq_softmax_out; + *search_seq_softmax >> *search_seq_softmax_out_log; + + *search_seq_softmax_out >> *search_aligned_mat_mul_2; + *out_emb_padding >> *search_aligned_mat_mul_2; + *search_aligned_mat_mul_2 >> *search_aligned_mat_mul_2_out; + *search_aligned_mat_mul_2 >> *search_aligned_mat_mul_2_a; + *search_aligned_mat_mul_2 >> *search_aligned_mat_mul_2_b; + *search_aligned_mat_mul_2 >> *search_aligned_mat_mul_2_c; + + *search_aligned_mat_mul_2_out >> *search_seq_depadding; + *out_new >> *search_seq_depadding; + *search_seq_depadding >> *search_seq_depadding_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_search_attention"); + op_desc.SetInput("X", {matched.at("input")->arg()->name}); + op_desc.SetInput("W", {matched.at("search_seq_fc_w")->arg()->name}); + op_desc.SetInput("b", {matched.at("search_seq_fc_b")->arg()->name}); + op_desc.SetOutput("Out", + {matched.at("search_seq_depadding_out")->arg()->name}); + + auto* padding_op_info = + matched.at("search_group_padding")->stmt()->op_info(); + op_desc.SetAttr("pad_id", padding_op_info->GetAttr("pad_id")); + auto* matmul_0_op_info = + matched.at("search_aligned_mat_mul")->stmt()->op_info(); + op_desc.SetAttr("alpha0", matmul_0_op_info->GetAttr("alpha")); + auto* matmul_1_op_info = + matched.at("search_aligned_mat_mul_2")->stmt()->op_info(); + op_desc.SetAttr("alpha1", 
matmul_1_op_info->GetAttr("alpha")); + auto* mask_op_info = + matched.at("search_attention_padding_mask")->stmt()->op_info(); + op_desc.SetAttr("mask", mask_op_info->GetAttr("mask")); + + auto* new_stmt = matched.at("search_group_padding")->stmt(); + auto* scope = new_stmt->op()->scope(); + auto w_name = matched.at("search_seq_fc_w")->arg()->name; + auto* w_t = scope->FindMutableTensor(w_name); + auto w_dims = w_t->dims(); + int w_len = w_t->numel(); + float* w_on_host = w_t->mutable_data(); + + float max_f = paddle::lite::xpu::math::FindMaxAbs(w_on_host, w_len); + std::unique_ptr w_int16(new int16_t[w_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + w_on_host, w_int16.get(), max_f, w_len); + memcpy(w_on_host, w_int16.get(), w_len * sizeof(int16_t)); + op_desc.SetAttr("W_max", max_f); + + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, scope); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + DirectedLink(matched.at("search_seq_fc_w"), + matched.at("search_group_padding")); + DirectedLink(matched.at("search_seq_fc_b"), + matched.at("search_group_padding")); + IR_OP_VAR_LINK(matched.at("search_group_padding"), + matched.at("search_seq_depadding_out")); + } +}; + +class XPUMmdnnMatchConvTopkFuser : public FuseBase { + public: + void BuildPattern() override { + auto* input_x = VarNode("input_x") + ->assert_is_op_input("match_matrix_tensor", "X") + ->AsInput(); + auto* input_y = VarNode("input_y") + ->assert_is_op_input("match_matrix_tensor", "Y") + ->AsInput(); + auto* input_w = VarNode("input_w") + ->assert_is_op_input("match_matrix_tensor", "W") + ->AsInput(); + + auto* match_matrix_tensor = + OpNode("match_matrix_tensor", "match_matrix_tensor"); + auto* match_out = VarNode("match_out") + ->assert_is_op_output("match_matrix_tensor", "Out") + ->AsIntermediate(); + auto* match_tmp = VarNode("match_tmp") + ->assert_is_op_output("match_matrix_tensor", "Tmp") + ->AsIntermediate(); + auto* relu0 = OpNode("relu0", "relu")->AsIntermediate(); + auto* relu0_out = VarNode("relu0_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + auto* conv_w = + VarNode("conv_w")->assert_is_op_input("var_conv_2d", "W")->AsInput(); + auto* conv = OpNode("conv", "var_conv_2d")->AsIntermediate(); + auto* conv_out = VarNode("conv_out") + ->assert_is_op_output("var_conv_2d", "Out") + ->AsIntermediate(); + auto* conv_col = VarNode("conv_col") + ->assert_is_op_output("var_conv_2d", "Col") + ->AsIntermediate(); + auto* relu1 = OpNode("relu1", "relu")->AsIntermediate(); + auto* relu1_out = VarNode("relu1_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + auto* seq_concat = + OpNode("seq_concat", "sequence_concat")->AsIntermediate(); + auto* seq_concat_out = + VarNode("seq_concat_out") + ->assert_is_op_output("sequence_concat", "Out") + ->assert_is_op_input("sequence_topk_avg_pooling", "X") + ->AsIntermediate(); + auto* topk_col = + VarNode("topk_col") + ->assert_is_op_input("sequence_topk_avg_pooling", "COLUMN") + ->AsInput(); + auto* topk_row = + VarNode("topk_row") + ->assert_is_op_input("sequence_topk_avg_pooling", "ROW") + ->AsInput(); + auto* topk = OpNode("topk", "sequence_topk_avg_pooling")->AsIntermediate(); + auto* topk_out = + VarNode("topk_out") + ->assert_is_op_output("sequence_topk_avg_pooling", "Out") + ->AsOutput(); + auto* topk_pos = + VarNode("topk_pos") + 
->assert_is_op_output("sequence_topk_avg_pooling", "pos") + ->AsIntermediate(); + + *input_x >> *match_matrix_tensor; + *input_y >> *match_matrix_tensor; + *input_w >> *match_matrix_tensor; + *match_matrix_tensor >> *match_out >> *relu0 >> *relu0_out; + *match_matrix_tensor >> *match_tmp; + + *relu0_out >> *conv >> *conv_out >> *relu1 >> *relu1_out; + *conv_w >> *conv; + *conv >> *conv_col; + + *relu0_out >> *seq_concat; + *relu1_out >> *seq_concat; + *seq_concat >> *seq_concat_out >> *topk >> *topk_out; + *topk_col >> *topk; + *topk_row >> *topk; + *topk >> *topk_pos; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_match_conv_topk"); + op_desc.SetInput("input_x", {matched.at("input_x")->arg()->name}); + op_desc.SetInput("input_y", {matched.at("input_y")->arg()->name}); + op_desc.SetInput("input_w", {matched.at("input_w")->arg()->name}); + op_desc.SetInput("conv_w", {matched.at("conv_w")->arg()->name}); + op_desc.SetOutput("topk_out", {matched.at("topk_out")->arg()->name}); + + auto* match_op_info = matched.at("match_matrix_tensor")->stmt()->op_info(); + op_desc.SetAttr("input_w_max", + match_op_info->GetAttr("w_max")); + op_desc.SetAttr("dim_t", match_op_info->GetAttr("dim_t")); + auto* conv_op_info = matched.at("conv")->stmt()->op_info(); + op_desc.SetAttr("conv_w_max", conv_op_info->GetAttr("w_max")); + auto* topk_op_info = matched.at("topk")->stmt()->op_info(); + op_desc.SetAttr>( + "topks", topk_op_info->GetAttr>("topks")); + op_desc.SetAttr("channel_num", + topk_op_info->GetAttr("channel_num")); + + auto* new_stmt = matched.at("match_matrix_tensor")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + // XXX(miaotianxiang): redundant links around |topk| are automatically + // removed as |topk| is + // marked intermediate. 
+ // RemoveDirectedLink(matched.at("topk_col"), matched.at("topk")); + // RemoveDirectedLink(matched.at("topk_row"), matched.at("topk")); + std::vector arg_names{"conv_w"}; + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("match_matrix_tensor")); + } + std::vector out_names{"topk_out"}; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("match_matrix_tensor"), matched.at(name)); + } + } +}; + +class XPUMmdnnBidSeqRevEmbEltwiseFuser : public FuseBase { + public: + void BuildPattern() override { + auto* input0 = VarNode("input0")->AsInput(); + auto* input1 = VarNode("input1")->AsInput(); + auto* emb_tbl = VarNode("emb_tbl")->AsInput(); + + // fwd emb + auto* emb0 = OpNode("emb0", "lookup_table"); + auto* emb0_out = + VarNode("emb0_out")->assert_is_op_output("lookup_table", "Out"); + auto* emb1 = OpNode("emb1", "lookup_table"); + auto* emb1_out = + VarNode("emb1_out")->assert_is_op_output("lookup_table", "Out"); + + auto* eltwise01 = OpNode("eltwise01", "search_seq_arithmetic"); + auto* eltwise01_out = + VarNode("eltwise01_out") + ->assert_is_op_output("search_seq_arithmetic", "Out") + ->AsOutput(); + + // rev emb + auto* seq_rev2 = OpNode("seq_rev2", "sequence_reverse")->AsIntermediate(); + auto* seq_rev2_out = VarNode("seq_rev2_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* seq_rev3 = OpNode("seq_rev3", "sequence_reverse")->AsIntermediate(); + auto* seq_rev3_out = VarNode("seq_rev3_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* emb2 = OpNode("emb2", "lookup_table")->AsIntermediate(); + auto* emb2_out = VarNode("emb2_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + auto* emb3 = OpNode("emb3", "lookup_table")->AsIntermediate(); + auto* emb3_out = VarNode("emb3_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + + auto* eltwise23 = + OpNode("eltwise23", "search_seq_arithmetic")->AsIntermediate(); + auto* eltwise23_out = + VarNode("eltwise23_out") + ->assert_is_op_output("search_seq_arithmetic", "Out") + ->AsOutput(); + + *input0 >> *emb0 >> *emb0_out >> *eltwise01 >> *eltwise01_out; + *emb_tbl >> *emb0; + *input1 >> *emb1 >> *emb1_out >> *eltwise01; + *emb_tbl >> *emb1; + + *input0 >> *seq_rev2 >> *seq_rev2_out >> *emb2 >> *emb2_out >> *eltwise23 >> + *eltwise23_out; + *emb_tbl >> *emb2; + *input1 >> *seq_rev3 >> *seq_rev3_out >> *emb3 >> *emb3_out >> *eltwise23; + *emb_tbl >> *emb3; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("sequence_reverse"); + op_desc.SetInput("X", {matched.at("eltwise01_out")->arg()->name}); + op_desc.SetOutput("Y", {matched.at("eltwise23_out")->arg()->name}); + + auto emb0_op = matched.at("emb0")->stmt()->op(); + auto new_seq_rev_op = LiteOpRegistry::Global().Create("sequence_reverse"); + new_seq_rev_op->Attach(op_desc, emb0_op->scope()); + auto* new_seq_rev_node = + graph->GraphCreateInstructNode(new_seq_rev_op, emb0_op->valid_places()); + + DirectedLink(matched.at("eltwise01_out"), new_seq_rev_node); + DirectedLink(new_seq_rev_node, matched.at("eltwise23_out")); + } +}; + +class XPUMmdnnBidEmbAttFuser : public FuseBase { + public: + void BuildPattern() override { + auto* input0 = VarNode("input0")->AsInput(); + auto* input1 = VarNode("input1")->AsInput(); + auto* emb_tbl = VarNode("emb_tbl")->AsInput(); + + auto* emb0 = OpNode("emb0", "lookup_table"); + auto* emb0_out = VarNode("emb0_out") + 
->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + auto* emb1 = OpNode("emb1", "lookup_table")->AsIntermediate(); + auto* emb1_out = VarNode("emb1_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + auto* eltwise01 = + OpNode("eltwise01", "search_seq_arithmetic")->AsIntermediate(); + auto* eltwise01_out = + VarNode("eltwise01_out") + ->assert_is_op_output("search_seq_arithmetic", "Out") + ->AsOutput(); + + auto* att_2in1_w = + VarNode("att_2in1_w") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "W") + ->AsInput(); + auto* att_2in1_b = + VarNode("att_2in1_b") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "b") + ->AsInput(); + auto* att_2in1 = + OpNode("att_2in1", "__xpu__mmdnn_search_attention")->AsIntermediate(); + auto* att_2in1_out = + VarNode("att_2in1_out") + ->assert_is_op_output("__xpu__mmdnn_search_attention", "Out") + ->AsIntermediate(); + auto* seq_pool_2in1 = + OpNode("seq_pool_2in1", "sequence_pool")->AsIntermediate(); + auto* seq_pool_2in1_out = VarNode("seq_pool_2in1_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_2in1_max_idx = + VarNode("seq_pool_2in1_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + *input0 >> *emb0 >> *emb0_out >> *eltwise01 >> *eltwise01_out; + *emb_tbl >> *emb0; + *input1 >> *emb1 >> *emb1_out >> *eltwise01; + *emb_tbl >> *emb1; + + *eltwise01_out >> *att_2in1 >> *att_2in1_out >> *seq_pool_2in1 >> + *seq_pool_2in1_out; + *seq_pool_2in1 >> *seq_pool_2in1_max_idx; + *att_2in1_w >> *att_2in1; + *att_2in1_b >> *att_2in1; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_bid_emb_att"); + op_desc.SetInput("id0", {matched.at("input0")->arg()->name}); + op_desc.SetInput("id1", {matched.at("input1")->arg()->name}); + op_desc.SetInput("emb_tbl", {matched.at("emb_tbl")->arg()->name}); + op_desc.SetInput("att_fc_w", {matched.at("att_2in1_w")->arg()->name}); + op_desc.SetInput("att_fc_b", {matched.at("att_2in1_b")->arg()->name}); + op_desc.SetOutput("att_pool_out", + {matched.at("seq_pool_2in1_out")->arg()->name}); + op_desc.SetOutput("emb_fw_out", {matched.at("eltwise01_out")->arg()->name}); + + auto* att_fc_op_info = matched.at("att_2in1")->stmt()->op_info(); + op_desc.SetAttr("att_fc_w_max", + att_fc_op_info->GetAttr("W_max")); + + auto* new_stmt = matched.at("emb0")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + std::vector arg_names{ + "input1", "att_2in1_w", "att_2in1_b", + }; + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("emb0")); + } + std::vector out_names{ + "seq_pool_2in1_out", "eltwise01_out", + }; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("emb0"), matched.at(name)); + } + } +}; + +class XPUMmdnnBidEmbGrnnAttFuser : public FuseBase { + public: + void BuildPattern() override { + auto* input0 = VarNode("input0")->AsInput(); + auto* input1 = VarNode("input1")->AsInput(); + auto* emb_tbl = VarNode("emb_tbl")->AsInput(); + + auto* emb0 = OpNode("emb0", "lookup_table"); + auto* emb0_out = VarNode("emb0_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + auto* emb1 = OpNode("emb1", 
"lookup_table")->AsIntermediate(); + auto* emb1_out = VarNode("emb1_out") + ->assert_is_op_output("lookup_table", "Out") + ->AsIntermediate(); + auto* eltwise01 = + OpNode("eltwise01", "search_seq_arithmetic")->AsIntermediate(); + auto* eltwise01_out = + VarNode("eltwise01_out") + ->assert_is_op_output("search_seq_arithmetic", "Out") + ->AsOutput(); + + auto* seq_rev_right0 = + OpNode("seq_rev_right0", "sequence_reverse")->AsIntermediate(); + auto* seq_rev_right0_out = + VarNode("seq_rev_right0_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* grnn_right_wh = VarNode("grnn_right_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_right_wi = VarNode("grnn_right_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_right = OpNode("grnn_right", "search_grnn")->AsIntermediate(); + auto* grnn_right_out = VarNode("grnn_right_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_right_idx_sorted_by_width = + VarNode("grnn_right_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_right_layout_input = + VarNode("grnn_right_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_right_tmp_buffer = + VarNode("grnn_right_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_rev_right1 = + OpNode("seq_rev_right1", "sequence_reverse")->AsIntermediate(); + auto* seq_rev_right1_out = + VarNode("seq_rev_right1_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + auto* seq_pool_right = + OpNode("seq_pool_right", "sequence_pool")->AsIntermediate(); + auto* seq_pool_right_out = VarNode("seq_pool_right_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_right_max_idx = + VarNode("seq_pool_right_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* grnn_left_wh = VarNode("grnn_left_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_left_wi = VarNode("grnn_left_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_left = OpNode("grnn_left", "search_grnn")->AsIntermediate(); + auto* grnn_left_out = VarNode("grnn_left_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_left_idx_sorted_by_width = + VarNode("grnn_left_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_left_layout_input = + VarNode("grnn_left_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_left_tmp_buffer = + VarNode("grnn_left_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_pool_left = + OpNode("seq_pool_left", "sequence_pool")->AsIntermediate(); + auto* seq_pool_left_out = VarNode("seq_pool_left_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_left_max_idx = + VarNode("seq_pool_left_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* concat_2in1 = OpNode("concat_2in1", "concat")->AsIntermediate(); + auto* concat_2in1_out = VarNode("concat_2in1_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + auto* att_2in1_w = + VarNode("att_2in1_w") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "W") + ->AsInput(); + auto* 
att_2in1_b = + VarNode("att_2in1_b") + ->assert_is_op_input("__xpu__mmdnn_search_attention", "b") + ->AsInput(); + auto* att_2in1 = + OpNode("att_2in1", "__xpu__mmdnn_search_attention")->AsIntermediate(); + auto* att_2in1_out = + VarNode("att_2in1_out") + ->assert_is_op_output("__xpu__mmdnn_search_attention", "Out") + ->AsIntermediate(); + auto* seq_pool_2in1 = + OpNode("seq_pool_2in1", "sequence_pool")->AsIntermediate(); + auto* seq_pool_2in1_out = VarNode("seq_pool_2in1_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsOutput(); + auto* seq_pool_2in1_max_idx = + VarNode("seq_pool_2in1_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* concat_3in1 = OpNode("concat_3in1", "concat")->AsIntermediate(); + auto* concat_3in1_out = VarNode("concat_3in1_out") + ->assert_is_op_output("concat", "Out") + ->AsOutput(); + + *input0 >> *emb0 >> *emb0_out >> *eltwise01 >> *eltwise01_out; + *emb_tbl >> *emb0; + *input1 >> *emb1 >> *emb1_out >> *eltwise01; + *emb_tbl >> *emb1; + + *eltwise01_out >> *seq_rev_right0 >> *seq_rev_right0_out >> *grnn_right >> + *grnn_right_out >> *seq_rev_right1 >> *seq_rev_right1_out; + *grnn_right_out >> *seq_pool_right >> *seq_pool_right_out; + *seq_pool_right >> *seq_pool_right_max_idx; + *grnn_right_wh >> *grnn_right; + *grnn_right_wi >> *grnn_right; + *grnn_right >> *grnn_right_idx_sorted_by_width; + *grnn_right >> *grnn_right_layout_input; + *grnn_right >> *grnn_right_tmp_buffer; + + *eltwise01_out >> *grnn_left >> *grnn_left_out >> *seq_pool_left >> + *seq_pool_left_out; + *seq_pool_left >> *seq_pool_left_max_idx; + *grnn_left_wh >> *grnn_left; + *grnn_left_wi >> *grnn_left; + *grnn_left >> *grnn_left_idx_sorted_by_width; + *grnn_left >> *grnn_left_layout_input; + *grnn_left >> *grnn_left_tmp_buffer; + + *seq_rev_right1_out >> *concat_2in1; + *grnn_left_out >> *concat_2in1; + *concat_2in1 >> *concat_2in1_out >> *att_2in1 >> *att_2in1_out >> + *seq_pool_2in1 >> *seq_pool_2in1_out; + *seq_pool_2in1 >> *seq_pool_2in1_max_idx; + *att_2in1_w >> *att_2in1; + *att_2in1_b >> *att_2in1; + + *eltwise01_out >> *concat_3in1; + *seq_rev_right1_out >> *concat_3in1; + *grnn_left_out >> *concat_3in1; + *concat_3in1 >> *concat_3in1_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_bid_emb_grnn_att"); + op_desc.SetInput("id0", {matched.at("input0")->arg()->name}); + op_desc.SetInput("id1", {matched.at("input1")->arg()->name}); + op_desc.SetInput("emb_tbl", {matched.at("emb_tbl")->arg()->name}); + op_desc.SetInput("grnn_fw_wh", {matched.at("grnn_left_wh")->arg()->name}); + op_desc.SetInput("grnn_fw_wi", {matched.at("grnn_left_wi")->arg()->name}); + op_desc.SetInput("grnn_rv_wh", {matched.at("grnn_right_wh")->arg()->name}); + op_desc.SetInput("grnn_rv_wi", {matched.at("grnn_right_wi")->arg()->name}); + op_desc.SetInput("att_fc_w", {matched.at("att_2in1_w")->arg()->name}); + op_desc.SetInput("att_fc_b", {matched.at("att_2in1_b")->arg()->name}); + op_desc.SetOutput("grnn_fw_pool_out", + {matched.at("seq_pool_left_out")->arg()->name}); + op_desc.SetOutput("grnn_rv_pool_out", + {matched.at("seq_pool_right_out")->arg()->name}); + op_desc.SetOutput("att_pool_out", + {matched.at("seq_pool_2in1_out")->arg()->name}); + op_desc.SetOutput("concat_3in1_out", + {matched.at("concat_3in1_out")->arg()->name}); + op_desc.SetOutput("emb_fw_out", {matched.at("eltwise01_out")->arg()->name}); + + auto* grnn_fw_op_info = matched.at("grnn_left")->stmt()->op_info(); + 
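The "wh_max"/"wi_max" vectors read off grnn_left/grnn_right just below were attached earlier by XPUMmdnnFloat2Fix, which quantizes each search_grnn weight (Wi, Wh) in three equal chunks with one max-abs scale per chunk; this fuser only forwards them as the grnn_fw_*/grnn_rv_* attributes of the fused op. A small self-contained sketch of that per-chunk scheme (helper names and the exact rounding are illustrative, not the real xpu::math behavior):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Quantize a weight in three equal chunks, one max-abs scale per chunk.
// The returned maxima correspond to what the pass stores as
// "__xpu__wi_max" / "__xpu__wh_max" and later "grnn_fw_wi_maxs" etc.
static std::vector<float> QuantizeInThreeChunks(const std::vector<float>& w,
                                                std::vector<int16_t>* out) {
  const int stride = static_cast<int>(w.size()) / 3;
  std::vector<float> maxs(3, 0.f);
  out->resize(w.size());
  for (int c = 0; c < 3; ++c) {
    const float* chunk = w.data() + c * stride;
    for (int i = 0; i < stride; ++i) {
      maxs[c] = std::max(maxs[c], std::fabs(chunk[i]));
    }
    const float scale = (maxs[c] > 0.f) ? 32767.f / maxs[c] : 0.f;
    for (int i = 0; i < stride; ++i) {
      (*out)[c * stride + i] =
          static_cast<int16_t>(std::round(chunk[i] * scale));
    }
  }
  return maxs;
}

int main() {
  std::vector<float> wi = {0.1f, -0.2f, 1.0f, -1.5f, 3.0f, 0.5f};
  std::vector<int16_t> wi_q;
  std::vector<float> wi_max = QuantizeInThreeChunks(wi, &wi_q);
  return (wi_max.size() == 3 && wi_max[2] == 3.0f) ? 0 : 1;
}
```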
op_desc.SetAttr>( + "grnn_fw_wh_maxs", + grnn_fw_op_info->GetAttr>("wh_max")); + op_desc.SetAttr>( + "grnn_fw_wi_maxs", + grnn_fw_op_info->GetAttr>("wi_max")); + auto* grnn_rv_op_info = matched.at("grnn_right")->stmt()->op_info(); + op_desc.SetAttr>( + "grnn_rv_wh_maxs", + grnn_rv_op_info->GetAttr>("wh_max")); + op_desc.SetAttr>( + "grnn_rv_wi_maxs", + grnn_rv_op_info->GetAttr>("wi_max")); + auto* att_fc_op_info = matched.at("att_2in1")->stmt()->op_info(); + op_desc.SetAttr("att_fc_w_max", + att_fc_op_info->GetAttr("W_max")); + + auto* new_stmt = matched.at("emb0")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + std::vector arg_names{ + "input1", + "grnn_left_wh", + "grnn_left_wi", + "grnn_right_wh", + "grnn_right_wi", + "att_2in1_w", + "att_2in1_b", + }; + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("emb0")); + } + std::vector out_names{ + "seq_pool_left_out", + "seq_pool_right_out", + "seq_pool_2in1_out", + "concat_3in1_out", + "eltwise01_out", + }; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("emb0"), matched.at(name)); + } + } +}; + +class XPUMmdnnMergeAllFuser : public FuseBase { + public: + void BuildPattern() override { + auto* concat_7in1_input0 = VarNode("concat_7in1_input0") + ->assert_is_op_nth_input("concat", "X", 0) + ->AsInput(); + auto* concat_7in1_input1 = VarNode("concat_7in1_input1") + ->assert_is_op_nth_input("concat", "X", 1) + ->AsInput(); + auto* concat_7in1_input2 = VarNode("concat_7in1_input2") + ->assert_is_op_nth_input("concat", "X", 2) + ->AsInput(); + auto* concat_7in1_input3 = VarNode("concat_7in1_input3") + ->assert_is_op_nth_input("concat", "X", 3) + ->AsInput(); + auto* concat_7in1_input4 = VarNode("concat_7in1_input4") + ->assert_is_op_nth_input("concat", "X", 4) + ->AsInput(); + auto* concat_7in1_input5 = VarNode("concat_7in1_input5") + ->assert_is_op_nth_input("concat", "X", 5) + ->AsInput(); + auto* concat_7in1_input6 = VarNode("concat_7in1_input6") + ->assert_is_op_nth_input("concat", "X", 6) + ->AsInput(); + auto* concat_7in1 = OpNode("concat_7in1", "concat"); + auto* concat_7in1_out = VarNode("concat_7in1_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + auto* search_fc0_w = VarNode("search_fc0_w") + ->assert_is_op_input("search_fc", "W") + ->AsInput(); + auto* search_fc0_b = VarNode("search_fc0_b") + ->assert_is_op_input("search_fc", "b") + ->AsInput(); + auto* search_fc0 = OpNode("search_fc0", "search_fc")->AsIntermediate(); + auto* search_fc0_out = VarNode("search_fc0_out") + ->assert_is_op_output("search_fc", "Out") + ->AsIntermediate(); + auto* relu0 = OpNode("relu0", "relu")->AsIntermediate(); + auto* relu0_out = VarNode("relu0_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + + auto* concat_2in1_input0 = VarNode("concat_2in1_input0") + ->assert_is_op_nth_input("concat", "X", 0) + ->AsInput(); + auto* concat_2in1_input1 = VarNode("concat_2in1_input1") + ->assert_is_op_nth_input("concat", "X", 1) + ->AsInput(); + auto* concat_2in1 = OpNode("concat_2in1", "concat")->AsIntermediate(); + auto* concat_2in1_out = VarNode("concat_2in1_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + auto* seq_rev = OpNode("seq_rev", "sequence_reverse")->AsIntermediate(); + auto* seq_rev_out = 
VarNode("seq_rev_out") + ->assert_is_op_output("sequence_reverse", "Y") + ->AsIntermediate(); + + auto* grnn_rv_wh = VarNode("grnn_rv_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_rv_wi = VarNode("grnn_rv_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_rv = OpNode("grnn_rv", "search_grnn")->AsIntermediate(); + auto* grnn_rv_out = VarNode("grnn_rv_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_rv_idx_sorted_by_width = + VarNode("grnn_rv_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_rv_layout_input = + VarNode("grnn_rv_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_rv_tmp_buffer = + VarNode("grnn_rv_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_pool_rv = + OpNode("seq_pool_rv", "sequence_pool")->AsIntermediate(); + auto* seq_pool_rv_out = VarNode("seq_pool_rv_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsIntermediate(); + auto* seq_pool_rv_max_idx = + VarNode("seq_pool_rv_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* grnn_fw_wh = VarNode("grnn_fw_wh") + ->assert_is_op_input("search_grnn", "Wh") + ->AsInput(); + auto* grnn_fw_wi = VarNode("grnn_fw_wi") + ->assert_is_op_input("search_grnn", "Wi") + ->AsInput(); + auto* grnn_fw = OpNode("grnn_fw", "search_grnn")->AsIntermediate(); + auto* grnn_fw_out = VarNode("grnn_fw_out") + ->assert_is_op_output("search_grnn", "Out") + ->AsIntermediate(); + auto* grnn_fw_idx_sorted_by_width = + VarNode("grnn_fw_idx_sorted_by_width") + ->assert_is_op_output("search_grnn", "idx_sorted_by_width") + ->AsIntermediate(); + auto* grnn_fw_layout_input = + VarNode("grnn_fw_layout_input") + ->assert_is_op_output("search_grnn", "layout_input") + ->AsIntermediate(); + auto* grnn_fw_tmp_buffer = + VarNode("grnn_fw_tmp_buffer") + ->assert_is_op_output("search_grnn", "tmp_buffer") + ->AsIntermediate(); + auto* seq_pool_fw = + OpNode("seq_pool_fw", "sequence_pool")->AsIntermediate(); + auto* seq_pool_fw_out = VarNode("seq_pool_fw_out") + ->assert_is_op_output("sequence_pool", "Out") + ->AsIntermediate(); + auto* seq_pool_fw_max_idx = + VarNode("seq_pool_fw_max_idx") + ->assert_is_op_output("sequence_pool", "MaxIndex") + ->AsIntermediate(); + + auto* rv_fw_concat = OpNode("rv_fw_concat", "concat")->AsIntermediate(); + auto* rv_fw_concat_out = VarNode("rv_fw_concat_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + + auto* last_concat = OpNode("last_concat", "concat")->AsIntermediate(); + auto* last_concat_out = VarNode("last_concat_out") + ->assert_is_op_output("concat", "Out") + ->AsIntermediate(); + auto* search_fc1_w = VarNode("search_fc1_w") + ->assert_is_op_input("search_fc", "W") + ->AsInput(); + auto* search_fc1_b = VarNode("search_fc1_b") + ->assert_is_op_input("search_fc", "b") + ->AsInput(); + auto* search_fc1 = OpNode("search_fc1", "search_fc")->AsIntermediate(); + auto* search_fc1_out = VarNode("search_fc1_out") + ->assert_is_op_output("search_fc", "Out") + ->AsIntermediate(); + auto* relu1 = OpNode("relu1", "relu")->AsIntermediate(); + auto* relu1_out = VarNode("relu1_out") + ->assert_is_op_output("relu", "Out") + ->AsIntermediate(); + auto* search_fc2_w = VarNode("search_fc2_w") + ->assert_is_op_input("search_fc", "W") + ->AsInput(); + auto* search_fc2_b = VarNode("search_fc2_b") + 
->assert_is_op_input("search_fc", "b") + ->AsInput(); + auto* search_fc2 = OpNode("search_fc2", "search_fc")->AsIntermediate(); + auto* search_fc2_out = VarNode("search_fc2_out") + ->assert_is_op_output("search_fc", "Out") + ->AsOutput(); + + *concat_7in1_input0 >> *concat_7in1; + *concat_7in1_input1 >> *concat_7in1; + *concat_7in1_input2 >> *concat_7in1; + *concat_7in1_input3 >> *concat_7in1; + *concat_7in1_input4 >> *concat_7in1; + *concat_7in1_input5 >> *concat_7in1; + *concat_7in1_input6 >> *concat_7in1; + *concat_7in1 >> *concat_7in1_out >> *search_fc0 >> *search_fc0_out >> + *relu0 >> *relu0_out; + *search_fc0_w >> *search_fc0; + *search_fc0_b >> *search_fc0; + + *concat_2in1_input0 >> *concat_2in1; + *concat_2in1_input1 >> *concat_2in1; + *concat_2in1 >> *concat_2in1_out >> *seq_rev >> *seq_rev_out; + + *seq_rev_out >> *grnn_rv >> *grnn_rv_out >> *seq_pool_rv >> + *seq_pool_rv_out; + *seq_pool_rv >> *seq_pool_rv_max_idx; + *grnn_rv_wh >> *grnn_rv; + *grnn_rv_wi >> *grnn_rv; + *grnn_rv >> *grnn_rv_idx_sorted_by_width; + *grnn_rv >> *grnn_rv_layout_input; + *grnn_rv >> *grnn_rv_tmp_buffer; + + *concat_2in1_out >> *grnn_fw >> *grnn_fw_out >> *seq_pool_fw >> + *seq_pool_fw_out; + *seq_pool_fw >> *seq_pool_fw_max_idx; + *grnn_fw_wh >> *grnn_fw; + *grnn_fw_wi >> *grnn_fw; + *grnn_fw >> *grnn_fw_idx_sorted_by_width; + *grnn_fw >> *grnn_fw_layout_input; + *grnn_fw >> *grnn_fw_tmp_buffer; + + *seq_pool_rv_out >> *rv_fw_concat; + *seq_pool_fw_out >> *rv_fw_concat; + *rv_fw_concat >> *rv_fw_concat_out; + + *rv_fw_concat_out >> *last_concat; + *relu0_out >> *last_concat; + *last_concat >> *last_concat_out >> *search_fc1 >> *search_fc1_out >> + *relu1 >> *relu1_out >> *search_fc2 >> *search_fc2_out; + *search_fc1_w >> *search_fc1; + *search_fc1_b >> *search_fc1; + *search_fc2_w >> *search_fc2; + *search_fc2_b >> *search_fc2; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__mmdnn_merge_all"); + auto* concat_7in1_op_info = matched.at("concat_7in1")->stmt()->op_info(); + op_desc.SetInput("concat_7in1_x", concat_7in1_op_info->Input("X")); + auto* concat_2in1_op_info = matched.at("concat_2in1")->stmt()->op_info(); + op_desc.SetInput("concat_2in1_x", concat_2in1_op_info->Input("X")); + op_desc.SetInput("grnn_fw_wh", {matched.at("grnn_fw_wh")->arg()->name}); + op_desc.SetInput("grnn_fw_wi", {matched.at("grnn_fw_wi")->arg()->name}); + op_desc.SetInput("grnn_rv_wh", {matched.at("grnn_rv_wh")->arg()->name}); + op_desc.SetInput("grnn_rv_wi", {matched.at("grnn_rv_wi")->arg()->name}); + op_desc.SetInput("fc0_w", {matched.at("search_fc0_w")->arg()->name}); + op_desc.SetInput("fc0_b", {matched.at("search_fc0_b")->arg()->name}); + op_desc.SetInput("fc1_w", {matched.at("search_fc1_w")->arg()->name}); + op_desc.SetInput("fc1_b", {matched.at("search_fc1_b")->arg()->name}); + op_desc.SetInput("fc2_w", {matched.at("search_fc2_w")->arg()->name}); + op_desc.SetInput("fc2_b", {matched.at("search_fc2_b")->arg()->name}); + + op_desc.SetOutput("out", {matched.at("search_fc2_out")->arg()->name}); + + auto* grnn_fw_op_info = matched.at("grnn_fw")->stmt()->op_info(); + op_desc.SetAttr>( + "grnn_fw_wh_maxs", + grnn_fw_op_info->GetAttr>("wh_max")); + op_desc.SetAttr>( + "grnn_fw_wi_maxs", + grnn_fw_op_info->GetAttr>("wi_max")); + auto* grnn_rv_op_info = matched.at("grnn_rv")->stmt()->op_info(); + op_desc.SetAttr>( + "grnn_rv_wh_maxs", + grnn_rv_op_info->GetAttr>("wh_max")); + op_desc.SetAttr>( + "grnn_rv_wi_maxs", + 
grnn_rv_op_info->GetAttr>("wi_max")); + auto* fc0_op_info = matched.at("search_fc0")->stmt()->op_info(); + op_desc.SetAttr("fc0_w_max", fc0_op_info->GetAttr("w_max")); + auto* fc1_op_info = matched.at("search_fc1")->stmt()->op_info(); + op_desc.SetAttr("fc1_w_max", fc1_op_info->GetAttr("w_max")); + auto* fc2_op_info = matched.at("search_fc2")->stmt()->op_info(); + op_desc.SetAttr("fc2_w_max", fc2_op_info->GetAttr("w_max")); + + auto* new_stmt = matched.at("concat_7in1")->stmt(); + auto new_op = LiteOpRegistry::Global().Create(op_desc.Type()); + new_op->Attach(op_desc, new_stmt->op()->scope()); + new_op->SetValidPlaces(new_stmt->op()->valid_places()); + auto kernels = new_op->CreateKernels(new_op->valid_places()); + new_stmt->SetOp(new_op); + new_stmt->SetKernels(std::move(kernels)); + + std::vector arg_names{ + "concat_2in1_input0", + "concat_2in1_input1", + "grnn_fw_wh", + "grnn_fw_wi", + "grnn_rv_wh", + "grnn_rv_wi", + "search_fc0_w", + "search_fc0_b", + "search_fc1_w", + "search_fc1_b", + "search_fc2_w", + "search_fc2_b", + }; + for (auto name : arg_names) { + DirectedLink(matched.at(name), matched.at("concat_7in1")); + } + std::vector out_names{ + "search_fc2_out", + }; + for (auto name : out_names) { + IR_OP_VAR_LINK(matched.at("concat_7in1"), matched.at(name)); + } + } +}; + +} // namespace fusion + +class XPUMmdnnFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return; + + fusion::XPUMmdnnFloat2Fix float_2_fix; + float_2_fix(graph.get()); + fusion::XPUMmdnnSearchAttentionFuser search_att_fuser; + search_att_fuser(graph.get()); + fusion::XPUMmdnnMatchConvTopkFuser match_conv_topk_fuser; + match_conv_topk_fuser(graph.get()); + + fusion::XPUMmdnnBidSeqRevEmbEltwiseFuser bi_seq_rev_emb_eltwise_fuser; + bi_seq_rev_emb_eltwise_fuser(graph.get()); + fusion::XPUMmdnnBidEmbGrnnAttFuser bid_emb_grnn_att_fuser; + bid_emb_grnn_att_fuser(graph.get()); + fusion::XPUMmdnnBidEmbAttFuser bid_emb_att_fuser; + bid_emb_att_fuser(graph.get()); + fusion::XPUMmdnnMergeAllFuser merge_all_fuser; + merge_all_fuser(graph.get()); + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(__xpu__mmdnn_fuse_pass, paddle::lite::mir::XPUMmdnnFusePass) + .BindTargets({TARGET(kXPU)}) + .BindKernel("__xpu__mmdnn_search_attention") + .BindKernel("__xpu__mmdnn_bid_emb_grnn_att") + .BindKernel("__xpu__mmdnn_bid_emb_att") + .BindKernel("__xpu__mmdnn_match_conv_topk") + .BindKernel("__xpu__mmdnn_merge_all"); diff --git a/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc index 525042e44b2997013943f392f592d812bd68fa0b..04988612192b79824b1294428fa9b1c38d784979 100644 --- a/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc +++ b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc @@ -639,20 +639,21 @@ class XPUMultiEncoderFusePass : public ProgramPass { std::set fc_int31_ids; #ifdef LITE_WITH_XPU // TODO(miaotianxiang): core/mir/*_pass.cc are compiled anyway and need to - // access Context::_multi_encoder_precision, but this static member - // variable in class specialization defined in lite/core/context.cc - // is only compiled iff LITE_WITH_XPU==ON. To suppress linkage error, we use + // access TargetWrapperXPU::multi_encoder_precision, but this static member + // variable in class specialization defined in + // lite/backends/xpu/target_wrapper.cc is only compiled iff + // LITE_WITH_XPU==ON. 
To suppress linkage error, we use // #ifdef here. Any better idea? if (GetStringFromEnv("XPU_ENCODER_PRECISION", "int16") == "int31" || - lite::Context::_multi_encoder_precision == "int31") { + lite::TargetWrapperXPU::multi_encoder_precision == "int31") { fc_int31_ids = {0, 1, 2, 3, 4, 5}; VLOG(3) << "Use int31 in XPUMultiEncoderOp, " - << "lite::Context<>::_multi_encoder_precision=" - << lite::Context::_multi_encoder_precision; + << "lite::TargetWrapperXPU::multi_encoder_precision=" + << lite::TargetWrapperXPU::multi_encoder_precision; } else { VLOG(3) << "Use int16 in XPUMultiEncoderOp, " - << "lite::Context<>::_multi_encoder_precision=" - << lite::Context::_multi_encoder_precision; + << "lite::TargetWrapperXPU::multi_encoder_precision=" + << lite::TargetWrapperXPU::multi_encoder_precision; } #endif diff --git a/lite/core/mir/fusion/__xpu__resnet_cbam_fuse_pass.cc b/lite/core/mir/fusion/__xpu__resnet_cbam_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..b25eb084f286fccfa4afe8832f9dc1ff8384d552 --- /dev/null +++ b/lite/core/mir/fusion/__xpu__resnet_cbam_fuse_pass.cc @@ -0,0 +1,1389 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/backends/xpu/math.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" +#include "lite/operators/subgraph_op.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class XPUResNetCbamBlock0Fuser : public FuseBase { + public: + XPUResNetCbamBlock0Fuser() {} + + void BuildPattern() override { + auto* input = + VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput(); + + auto* left_conv1_weight = VarNode("left_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv1 = OpNode("left_conv1", "conv2d"); + auto* left_conv1_out = VarNode("left_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn1_scale = VarNode("left_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn1_bias = VarNode("left_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn1_mean = VarNode("left_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn1_var = VarNode("left_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn1 = OpNode("left_bn1", "batch_norm")->AsIntermediate(); + auto* left_bn1_out = VarNode("left_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* left_bn1_mean_out = VarNode("left_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn1_var_out = + VarNode("left_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* 
left_bn1_saved_mean = + VarNode("left_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn1_saved_var = + VarNode("left_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* left_relu1 = OpNode("left_relu1", "relu")->AsIntermediate(); + auto* left_relu1_out = VarNode("left_relu1_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* left_conv2_weight = VarNode("left_conv2_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv2 = OpNode("left_conv2", "conv2d")->AsIntermediate(); + auto* left_conv2_out = VarNode("left_conv2_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn2_scale = VarNode("left_bn2_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn2_bias = VarNode("left_bn2_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn2_mean = VarNode("left_bn2_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn2_var = VarNode("left_bn2_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn2 = OpNode("left_bn2", "batch_norm")->AsIntermediate(); + auto* left_bn2_out = VarNode("left_bn2_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* left_bn2_mean_out = VarNode("left_bn2_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn2_var_out = + VarNode("left_bn2_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* left_bn2_saved_mean = + VarNode("left_bn2_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn2_saved_var = + VarNode("left_bn2_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* left_relu2 = OpNode("left_relu2", "relu")->AsIntermediate(); + auto* left_relu2_out = VarNode("left_relu2_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* left_conv3_weight = VarNode("left_conv3_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv3 = OpNode("left_conv3", "conv2d")->AsIntermediate(); + auto* left_conv3_out = VarNode("left_conv3_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn3_scale = VarNode("left_bn3_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn3_bias = VarNode("left_bn3_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn3_mean = VarNode("left_bn3_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn3_var = VarNode("left_bn3_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn3 = OpNode("left_bn3", "batch_norm")->AsIntermediate(); + auto* left_bn3_out = VarNode("left_bn3_out") + ->assert_is_op_output("batch_norm", "Y") + ->AsIntermediate(); + auto* left_bn3_mean_out = VarNode("left_bn3_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn3_var_out = + VarNode("left_bn3_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* 
left_bn3_saved_mean = + VarNode("left_bn3_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn3_saved_var = + VarNode("left_bn3_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + // cbam specific + auto* reduce_mean = OpNode("reduce_mean", "reduce_mean")->AsIntermediate(); + auto* reduce_mean_out = VarNode("reduce_mean_out") + ->assert_is_op_output("reduce_mean", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + auto* reduce_max = OpNode("reduce_max", "reduce_max")->AsIntermediate(); + auto* reduce_max_out = VarNode("reduce_max_out") + ->assert_is_op_output("reduce_max", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + auto* concat = OpNode("concat", "concat")->AsIntermediate(); + auto* concat_out = VarNode("concat_out") + ->assert_is_op_output("concat", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + auto* left_conv4_weight = VarNode("left_conv4_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv4 = OpNode("left_conv4", "conv2d")->AsIntermediate(); + auto* left_conv4_out = VarNode("left_conv4_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("sigmoid", "X") + ->AsIntermediate(); + auto* sigmoid = OpNode("sigmoid", "sigmoid")->AsIntermediate(); + auto* sigmoid_out = VarNode("sigmoid_out") + ->assert_is_op_output("sigmoid", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* reshape = OpNode("reshape", "reshape2")->AsIntermediate(); + auto* reshape_out = VarNode("reshape_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* reshape_xshape = VarNode("reshape_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* eltwise_mul = + OpNode("eltwise_mul", "elementwise_mul")->AsIntermediate(); + auto* eltwise_mul_out = VarNode("eltwise_mul_out") + ->assert_is_op_output("elementwise_mul", "Out") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + + auto* right_conv1_weight = VarNode("right_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv1 = OpNode("right_conv1", "conv2d")->AsIntermediate(); + auto* right_conv1_out = VarNode("right_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn1_scale = VarNode("right_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn1_bias = VarNode("right_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn1_mean = VarNode("right_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn1_var = VarNode("right_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate(); + auto* right_bn1_out = VarNode("right_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + auto* right_bn1_mean_out = + VarNode("right_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn1_var_out = + VarNode("right_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn1_saved_mean = + VarNode("right_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + 
->AsIntermediate(); + auto* right_bn1_saved_var = + VarNode("right_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + auto* add = OpNode("add", "elementwise_add")->AsIntermediate(); + auto* add_out = VarNode("add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* relu = OpNode("relu", "relu")->AsIntermediate(); + auto* relu_out = + VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput(); + + *input >> *left_conv1 >> *left_conv1_out >> *left_bn1 >> *left_bn1_out >> + *left_relu1 >> *left_relu1_out >> *left_conv2 >> *left_conv2_out >> + *left_bn2 >> *left_bn2_out >> *left_relu2 >> *left_relu2_out >> + *left_conv3 >> *left_conv3_out >> *left_bn3 >> + *left_bn3_out /* >> *add*/; + + *left_bn3_out >> *reduce_mean >> *reduce_mean_out >> *concat; + *left_bn3_out >> *reduce_max >> *reduce_max_out >> *concat; + *concat >> *concat_out >> *left_conv4 >> *left_conv4_out >> *sigmoid >> + *sigmoid_out >> *eltwise_mul; + *left_conv4_weight >> *left_conv4; + *left_bn3_out >> *reshape >> *reshape_out >> *eltwise_mul; + *reshape >> *reshape_xshape; + *eltwise_mul >> *eltwise_mul_out >> *add; + + *left_conv1_weight >> *left_conv1; + *left_bn1_scale >> *left_bn1; + *left_bn1_bias >> *left_bn1; + *left_bn1_mean >> *left_bn1; + *left_bn1_var >> *left_bn1; + *left_bn1 >> *left_bn1_mean_out; + *left_bn1 >> *left_bn1_var_out; + *left_bn1 >> *left_bn1_saved_mean; + *left_bn1 >> *left_bn1_saved_var; + + *left_conv2_weight >> *left_conv2; + *left_bn2_scale >> *left_bn2; + *left_bn2_bias >> *left_bn2; + *left_bn2_mean >> *left_bn2; + *left_bn2_var >> *left_bn2; + *left_bn2 >> *left_bn2_mean_out; + *left_bn2 >> *left_bn2_var_out; + *left_bn2 >> *left_bn2_saved_mean; + *left_bn2 >> *left_bn2_saved_var; + + *left_conv3_weight >> *left_conv3; + *left_bn3_scale >> *left_bn3; + *left_bn3_bias >> *left_bn3; + *left_bn3_mean >> *left_bn3; + *left_bn3_var >> *left_bn3; + *left_bn3 >> *left_bn3_mean_out; + *left_bn3 >> *left_bn3_var_out; + *left_bn3 >> *left_bn3_saved_mean; + *left_bn3 >> *left_bn3_saved_var; + + *input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >> + *right_bn1_out >> *add; + + *right_conv1_weight >> *right_conv1; + *right_bn1_scale >> *right_bn1; + *right_bn1_bias >> *right_bn1; + *right_bn1_mean >> *right_bn1; + *right_bn1_var >> *right_bn1; + *right_bn1 >> *right_bn1_mean_out; + *right_bn1 >> *right_bn1_var_out; + *right_bn1 >> *right_bn1_saved_mean; + *right_bn1 >> *right_bn1_saved_var; + + *add >> *add_out >> *relu >> *relu_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("resnet_cbam_block0"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Filter", + { + matched.at("left_conv1_weight")->arg()->name, + matched.at("left_conv2_weight")->arg()->name, + matched.at("left_conv3_weight")->arg()->name, + matched.at("left_conv4_weight")->arg()->name, + matched.at("right_conv1_weight")->arg()->name, + }); + op_desc.SetInput("Scale", + { + matched.at("left_bn1_scale")->arg()->name, + matched.at("left_bn2_scale")->arg()->name, + matched.at("left_bn3_scale")->arg()->name, + "placeholder_sa_conv", + matched.at("right_bn1_scale")->arg()->name, + }); + op_desc.SetInput("Bias", + { + matched.at("left_bn1_bias")->arg()->name, + matched.at("left_bn2_bias")->arg()->name, + matched.at("left_bn3_bias")->arg()->name, + "placeholder_sa_conv", + 
matched.at("right_bn1_bias")->arg()->name, + }); + op_desc.SetInput("Mean", + { + matched.at("left_bn1_mean")->arg()->name, + matched.at("left_bn2_mean")->arg()->name, + matched.at("left_bn3_mean")->arg()->name, + "placeholder_sa_conv", + matched.at("right_bn1_mean")->arg()->name, + }); + op_desc.SetInput("Var", + { + matched.at("left_bn1_variance")->arg()->name, + matched.at("left_bn2_variance")->arg()->name, + matched.at("left_bn3_variance")->arg()->name, + "placeholder_sa_conv", + matched.at("right_bn1_variance")->arg()->name, + }); + op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name}); + // XXX: keep these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr("sub_block", 0); + op_desc.SetAttr>("input_data_names", {}); + op_desc.SetAttr>("output_data_names", {}); + + auto block0_stmt = matched.at("left_conv1")->stmt(); + // block0_stmt->ResetOp(op_desc, graph->valid_places()); + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + // XXX: memleak? + auto sub_block_desc = new cpp::BlockDesc(); + static_cast(fake_subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places()); + block0_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "left_conv2_weight", + "left_conv3_weight", + "left_conv4_weight", + "right_conv1_weight", + "left_bn1_bias", + "left_bn2_bias", + "left_bn3_bias", + "right_bn1_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("left_conv1")); + } + IR_OP_VAR_LINK(matched.at("left_conv1"), matched.at("relu_out")); + } +}; + +class XPUResNetCbamBlock1Fuser : public FuseBase { + public: + XPUResNetCbamBlock1Fuser() {} + + void BuildPattern() override { + auto* input = VarNode("input") + ->assert_is_op_input("conv2d", "Input") + ->assert_is_op_input("elementwise_add") + ->AsInput(); + + auto* right_conv1_weight = VarNode("right_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv1 = OpNode("right_conv1", "conv2d"); + auto* right_conv1_out = VarNode("right_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn1_scale = VarNode("right_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn1_bias = VarNode("right_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn1_mean = VarNode("right_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn1_var = VarNode("right_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate(); + auto* right_bn1_out = VarNode("right_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* right_bn1_mean_out = + VarNode("right_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn1_var_out = + VarNode("right_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn1_saved_mean = + VarNode("right_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn1_saved_var = + VarNode("right_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* right_relu1 = OpNode("right_relu1", 
"relu")->AsIntermediate(); + auto* right_relu1_out = VarNode("right_relu1_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* right_conv2_weight = VarNode("right_conv2_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv2 = OpNode("right_conv2", "conv2d")->AsIntermediate(); + auto* right_conv2_out = VarNode("right_conv2_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn2_scale = VarNode("right_bn2_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn2_bias = VarNode("right_bn2_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn2_mean = VarNode("right_bn2_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn2_var = VarNode("right_bn2_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn2 = OpNode("right_bn2", "batch_norm")->AsIntermediate(); + auto* right_bn2_out = VarNode("right_bn2_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* right_bn2_mean_out = + VarNode("right_bn2_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn2_var_out = + VarNode("right_bn2_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn2_saved_mean = + VarNode("right_bn2_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn2_saved_var = + VarNode("right_bn2_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* right_relu2 = OpNode("right_relu2", "relu")->AsIntermediate(); + auto* right_relu2_out = VarNode("right_relu2_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* right_conv3_weight = VarNode("right_conv3_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv3 = OpNode("right_conv3", "conv2d")->AsIntermediate(); + auto* right_conv3_out = VarNode("right_conv3_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn3_scale = VarNode("right_bn3_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn3_bias = VarNode("right_bn3_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn3_mean = VarNode("right_bn3_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn3_var = VarNode("right_bn3_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn3 = OpNode("right_bn3", "batch_norm")->AsIntermediate(); + auto* right_bn3_out = VarNode("right_bn3_out") + ->assert_is_op_output("batch_norm", "Y") + ->AsIntermediate(); + auto* right_bn3_mean_out = + VarNode("right_bn3_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn3_var_out = + VarNode("right_bn3_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn3_saved_mean = + VarNode("right_bn3_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn3_saved_var = + VarNode("right_bn3_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + 
->AsIntermediate(); + + // cbam specific + auto* reduce_mean = OpNode("reduce_mean", "reduce_mean")->AsIntermediate(); + auto* reduce_mean_out = VarNode("reduce_mean_out") + ->assert_is_op_output("reduce_mean", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + auto* reduce_max = OpNode("reduce_max", "reduce_max")->AsIntermediate(); + auto* reduce_max_out = VarNode("reduce_max_out") + ->assert_is_op_output("reduce_max", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + auto* concat = OpNode("concat", "concat")->AsIntermediate(); + auto* concat_out = VarNode("concat_out") + ->assert_is_op_output("concat", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + auto* right_conv4_weight = VarNode("right_conv4_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv4 = OpNode("right_conv4", "conv2d")->AsIntermediate(); + auto* right_conv4_out = VarNode("right_conv4_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("sigmoid", "X") + ->AsIntermediate(); + auto* sigmoid = OpNode("sigmoid", "sigmoid")->AsIntermediate(); + auto* sigmoid_out = VarNode("sigmoid_out") + ->assert_is_op_output("sigmoid", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* reshape = OpNode("reshape", "reshape2")->AsIntermediate(); + auto* reshape_out = VarNode("reshape_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* reshape_xshape = VarNode("reshape_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* eltwise_mul = + OpNode("eltwise_mul", "elementwise_mul")->AsIntermediate(); + auto* eltwise_mul_out = VarNode("eltwise_mul_out") + ->assert_is_op_output("elementwise_mul", "Out") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + + auto* add = OpNode("add", "elementwise_add")->AsIntermediate(); + auto* add_out = VarNode("add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* relu = OpNode("relu", "relu")->AsIntermediate(); + auto* relu_out = + VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput(); + + *input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >> + *right_bn1_out >> *right_relu1 >> *right_relu1_out >> *right_conv2 >> + *right_conv2_out >> *right_bn2 >> *right_bn2_out >> *right_relu2 >> + *right_relu2_out >> *right_conv3 >> *right_conv3_out >> *right_bn3 >> + *right_bn3_out /* >> *add*/; + + *right_bn3_out >> *reduce_mean >> *reduce_mean_out >> *concat; + *right_bn3_out >> *reduce_max >> *reduce_max_out >> *concat; + *concat >> *concat_out >> *right_conv4 >> *right_conv4_out >> *sigmoid >> + *sigmoid_out >> *eltwise_mul; + *right_conv4_weight >> *right_conv4; + *right_bn3_out >> *reshape >> *reshape_out >> *eltwise_mul; + *reshape >> *reshape_xshape; + *eltwise_mul >> *eltwise_mul_out >> *add; + + *right_conv1_weight >> *right_conv1; + *right_bn1_scale >> *right_bn1; + *right_bn1_bias >> *right_bn1; + *right_bn1_mean >> *right_bn1; + *right_bn1_var >> *right_bn1; + *right_bn1 >> *right_bn1_mean_out; + *right_bn1 >> *right_bn1_var_out; + *right_bn1 >> *right_bn1_saved_mean; + *right_bn1 >> *right_bn1_saved_var; + + *right_conv2_weight >> *right_conv2; + *right_bn2_scale >> *right_bn2; + *right_bn2_bias >> *right_bn2; + *right_bn2_mean >> *right_bn2; + *right_bn2_var >> *right_bn2; + *right_bn2 >> *right_bn2_mean_out; + *right_bn2 >> *right_bn2_var_out; + *right_bn2 >> *right_bn2_saved_mean; 
+    *right_bn2 >> *right_bn2_saved_var;
+
+    *right_conv3_weight >> *right_conv3;
+    *right_bn3_scale >> *right_bn3;
+    *right_bn3_bias >> *right_bn3;
+    *right_bn3_mean >> *right_bn3;
+    *right_bn3_var >> *right_bn3;
+    *right_bn3 >> *right_bn3_mean_out;
+    *right_bn3 >> *right_bn3_var_out;
+    *right_bn3 >> *right_bn3_saved_mean;
+    *right_bn3 >> *right_bn3_saved_var;
+
+    *input >> *add;
+
+    *add >> *add_out >> *relu >> *relu_out;
+  }
+
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
+    cpp::OpDesc op_desc;
+    op_desc.SetType("resnet_cbam_block1");
+    op_desc.SetInput("Inputs", {matched.at("input")->arg()->name});
+    op_desc.SetInput("Filter",
+                     {
+                         matched.at("right_conv1_weight")->arg()->name,
+                         matched.at("right_conv2_weight")->arg()->name,
+                         matched.at("right_conv3_weight")->arg()->name,
+                         matched.at("right_conv4_weight")->arg()->name,
+                     });
+    op_desc.SetInput("Scale",
+                     {
+                         matched.at("right_bn1_scale")->arg()->name,
+                         matched.at("right_bn2_scale")->arg()->name,
+                         matched.at("right_bn3_scale")->arg()->name,
+                         "placeholder_sa_conv",
+                     });
+    op_desc.SetInput("Bias",
+                     {
+                         matched.at("right_bn1_bias")->arg()->name,
+                         matched.at("right_bn2_bias")->arg()->name,
+                         matched.at("right_bn3_bias")->arg()->name,
+                         "placeholder_sa_conv",
+                     });
+    op_desc.SetInput("Mean",
+                     {
+                         matched.at("right_bn1_mean")->arg()->name,
+                         matched.at("right_bn2_mean")->arg()->name,
+                         matched.at("right_bn3_mean")->arg()->name,
+                         "placeholder_sa_conv",
+                     });
+    op_desc.SetInput("Var",
+                     {
+                         matched.at("right_bn1_variance")->arg()->name,
+                         matched.at("right_bn2_variance")->arg()->name,
+                         matched.at("right_bn3_variance")->arg()->name,
+                         "placeholder_sa_conv",
+                     });
+    op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name});
+    // XXX: keep these to fool SubgraphOp::AttachImpl()
+    op_desc.SetAttr<int32_t>("sub_block", 0);
+    op_desc.SetAttr<std::vector<std::string>>("input_data_names", {});
+    op_desc.SetAttr<std::vector<std::string>>("output_data_names", {});
+
+    auto block1_stmt = matched.at("right_conv1")->stmt();
+    auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
+    // XXX: memleak?
+ auto sub_block_desc = new cpp::BlockDesc(); + static_cast(fake_subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places()); + block1_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "right_conv2_weight", + "right_conv3_weight", + "right_conv4_weight", + "right_bn1_bias", + "right_bn2_bias", + "right_bn3_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("right_conv1")); + } + IR_OP_VAR_LINK(matched.at("right_conv1"), matched.at("relu_out")); + } +}; + +class XPUResNetCbamBlock2Fuser : public FuseBase { + public: + XPUResNetCbamBlock2Fuser() {} + + void BuildPattern() override { + auto* input = VarNode("input")->assert_is_op_input("clip", "X")->AsInput(); + + auto* clip = OpNode("clip", "clip"); + auto* clip_out = VarNode("clip_out") + ->assert_is_op_output("clip", "Out") + ->assert_is_op_input("elementwise_pow") + ->AsIntermediate(); + auto* eltwise_y = VarNode("eltwise_y") + ->assert_is_op_input("elementwise_pow") + ->assert_is_op_input("elementwise_div") + ->AsIntermediate(); + auto* eltwise_pow = + OpNode("eltwise_pow", "elementwise_pow")->AsIntermediate(); + auto* eltwise_pow_out = VarNode("eltwise_pow_out") + ->assert_is_op_output("elementwise_pow", "Out") + ->assert_is_op_input("pad2d", "X") + ->AsIntermediate(); + auto* pad2d = OpNode("pad2d", "pad2d")->AsIntermediate(); + auto* pad2d_out = VarNode("pad2d_out") + ->assert_is_op_output("pad2d", "Out") + ->assert_is_op_input("pool2d", "X") + ->AsIntermediate(); + auto* pool2d = OpNode("pool2d", "pool2d")->AsIntermediate(); + auto* pool2d_out = VarNode("pool2d_out") + ->assert_is_op_output("pool2d", "Out") + ->assert_is_op_input("elementwise_pow") + ->AsIntermediate(); + + auto* fill_const = OpNode("fill_const", "fill_constant")->AsIntermediate(); + auto* fill_const_out = VarNode("fill_const_out") + ->assert_is_op_output("fill_constant", "Out") + ->assert_is_op_input("elementwise_div") + ->AsIntermediate(); + auto* eltwise_div = + OpNode("eltwise_div", "elementwise_div")->AsIntermediate(); + auto* eltwise_div_out = VarNode("eltwise_div_out") + ->assert_is_op_output("elementwise_div", "Out") + ->assert_is_op_input("elementwise_pow") + ->AsIntermediate(); + + auto* eltwise_pow2 = + OpNode("eltwise_pow2", "elementwise_pow")->AsIntermediate(); + auto* eltwise_pow2_out = VarNode("eltwise_pow2_out") + ->assert_is_op_output("elementwise_pow", "Out") + ->AsIntermediate(); + + auto* shape = OpNode("shape", "shape")->AsIntermediate(); + auto* shape_out = VarNode("shape_out") + ->assert_is_op_output("shape", "Out") + ->assert_is_op_input("gather") + ->AsIntermediate(); + auto* fill_const2 = + OpNode("fill_const2", "fill_constant")->AsIntermediate(); + auto* fill_const2_out = VarNode("fill_const2_out") + ->assert_is_op_output("fill_constant", "Out") + ->assert_is_op_input("gather") + ->AsIntermediate(); + auto* gather = OpNode("gather", "gather")->AsIntermediate(); + auto* gather_out = VarNode("gather_out") + ->assert_is_op_output("gather", "Out") + ->assert_is_op_input("assign", "X") + ->AsIntermediate(); + auto* assign = OpNode("assign", "assign")->AsIntermediate(); + auto* assign_out = VarNode("assign_out") + ->assert_is_op_output("assign", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + + auto* fill_const3 = + OpNode("fill_const3", "fill_constant")->AsIntermediate(); + auto* fill_const3_out = VarNode("fill_const3_out") + 
->assert_is_op_output("fill_constant", "Out") + ->assert_is_op_input("assign") + ->AsIntermediate(); + auto* assign2 = OpNode("assign2", "assign")->AsIntermediate(); + auto* assign2_out = VarNode("assign2_out") + ->assert_is_op_output("assign", "Out") + ->assert_is_op_input("concat") + ->AsIntermediate(); + + auto* concat = OpNode("concat", "concat")->AsIntermediate(); + auto* concat_out = VarNode("concat_out") + ->assert_is_op_output("concat", "Out") + ->assert_is_op_input("cast", "X") + ->AsIntermediate(); + auto* cast = OpNode("cast", "cast")->AsIntermediate(); + auto* cast_out = VarNode("cast_out") + ->assert_is_op_output("cast", "Out") + ->assert_is_op_input("reshape2", "Shape") + ->AsIntermediate(); + + auto* reshape2 = OpNode("reshape2", "reshape2")->AsIntermediate(); + auto* reshape2_out = VarNode("reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("matmul", "X") + ->AsIntermediate(); + auto* reshape2_xshape = VarNode("reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* matmul_y = + VarNode("matmul_y")->assert_is_op_input("matmul", "Y")->AsInput(); + auto* matmul = OpNode("matmul", "matmul")->AsIntermediate(); + auto* matmul_out = VarNode("matmul_out") + ->assert_is_op_output("matmul", "Out") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + auto* eltwise_add_y = VarNode("eltwise_add_y") + ->assert_is_op_input("elementwise_add") + ->AsInput(); + auto* eltwise_add = + OpNode("eltwise_add", "elementwise_add")->AsIntermediate(); + auto* eltwise_add_out = VarNode("eltwise_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->AsIntermediate(); + + auto* norm = OpNode("norm", "norm")->AsIntermediate(); + auto* norm_out = VarNode("norm_out") + ->assert_is_op_output("norm", "Out") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + auto* norm_norm = VarNode("norm_norm") + ->assert_is_op_output("norm", "Norm") + ->AsIntermediate(); + auto* fill_const4 = + OpNode("fill_const4", "fill_constant")->AsIntermediate(); + auto* fill_const4_out = VarNode("fill_const4_out") + ->assert_is_op_output("fill_constant", "Out") + ->assert_is_op_input("elementwise_add") + ->AsIntermediate(); + auto* eltwise_add2 = + OpNode("eltwise_add2", "elementwise_add")->AsIntermediate(); + auto* eltwise_add2_out = VarNode("eltwise_add2_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* fill_const5 = + OpNode("fill_const5", "fill_constant")->AsIntermediate(); + auto* fill_const5_out = VarNode("fill_const5_out") + ->assert_is_op_output("fill_constant", "Out") + ->assert_is_op_input("elementwise_mul") + ->AsIntermediate(); + auto* eltwise_mul = + OpNode("eltwise_mul", "elementwise_mul")->AsIntermediate(); + auto* eltwise_mul_out = VarNode("eltwise_mul_out") + ->assert_is_op_output("elementwise_mul", "Out") + ->assert_is_op_input("elementwise_div") + ->AsIntermediate(); + + auto* eltwise_div2 = + OpNode("eltwise_div2", "elementwise_div")->AsIntermediate(); + auto* eltwise_div2_out = VarNode("eltwise_div2_out") + ->assert_is_op_output("elementwise_div", "Out") + ->AsOutput(); + + *input >> *clip >> *clip_out >> *eltwise_pow >> *eltwise_pow_out >> + *pad2d >> *pad2d_out >> *pool2d >> *pool2d_out >> *eltwise_pow2; + *eltwise_y >> *eltwise_pow; + + *fill_const >> *fill_const_out >> *eltwise_div >> *eltwise_div_out >> + *eltwise_pow2; + *eltwise_y >> *eltwise_div; + + *eltwise_pow2 >> *eltwise_pow2_out >> *shape >> *shape_out >> *gather >> + 
        *gather_out >> *assign >> *assign_out >> *concat >> *concat_out >>
+        *cast >> *cast_out >> *reshape2;
+    *fill_const2 >> *fill_const2_out >> *gather;
+    *fill_const3 >> *fill_const3_out >> *assign2 >> *assign2_out >> *concat;
+    *eltwise_pow2_out >> *reshape2;
+
+    *reshape2 >> *reshape2_out >> *matmul >> *matmul_out >> *eltwise_add >>
+        *eltwise_add_out;
+    *reshape2 >> *reshape2_xshape;
+    *matmul_y >> *matmul;
+    *eltwise_add_y >> *eltwise_add;
+
+    *eltwise_add_out >> *norm >> *norm_out >> *eltwise_add2 >>
+        *eltwise_add2_out >> *eltwise_mul >> *eltwise_mul_out >>
+        *eltwise_div2 >> *eltwise_div2_out;
+    *norm >> *norm_norm;
+    *fill_const4 >> *fill_const4_out >> *eltwise_add2;
+    *fill_const5 >> *fill_const5_out >> *eltwise_mul;
+    *eltwise_add_out >> *eltwise_div2;
+  }
+
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
+    cpp::OpDesc op_desc;
+    op_desc.SetType("resnet_cbam_block2");
+    op_desc.SetInput("Inputs", {matched.at("input")->arg()->name});
+    op_desc.SetInput("Filter", {matched.at("matmul_y")->arg()->name});
+    op_desc.SetInput("Scale", {"placeholder_last_fc"});
+    op_desc.SetInput("Bias", {matched.at("eltwise_add_y")->arg()->name});
+    op_desc.SetInput("Mean", {"placeholder_last_fc"});
+    op_desc.SetInput("Var", {"placeholder_last_fc"});
+    op_desc.SetOutput("Outputs", {matched.at("eltwise_div2_out")->arg()->name});
+    // XXX: keep these to fool SubgraphOp::AttachImpl()
+    op_desc.SetAttr<int32_t>("sub_block", 0);
+    op_desc.SetAttr<std::vector<std::string>>("input_data_names", {});
+    op_desc.SetAttr<std::vector<std::string>>("output_data_names", {});
+
+    // extra traits to distill
+    auto block2_stmt = matched.at("clip")->stmt();
+    auto* scope = block2_stmt->op()->scope();
+    auto pow_tensor_name = matched.at("eltwise_y")->arg()->name;
+    auto* pow_tensor = scope->FindTensor(pow_tensor_name);
+    float pool_p = pow_tensor->data<float>()[0];
+    op_desc.SetAttr<float>("pool_p", pool_p);
+    auto* matmul_op_info = matched.at("matmul")->stmt()->op_info();
+    CHECK(matmul_op_info->GetAttr<bool>("transpose_Y") == true)
+        << "Y of last fc must have been transposed";
+
+    auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
+    // XXX: memleak?
+ auto sub_block_desc = new cpp::BlockDesc(); + static_cast(fake_subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + fake_subgraph_op->Attach(op_desc, scope); + fake_subgraph_op->SetValidPlaces(block2_stmt->op()->valid_places()); + block2_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "matmul_y", "eltwise_add_y", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("clip")); + } + IR_OP_VAR_LINK(matched.at("clip"), matched.at("eltwise_div2_out")); + } +}; + +class XPUResNetCbamFuser : public xpu::XPUFuseBase { + public: + XPUResNetCbamFuser() {} + + void BuildPattern() override { + auto* input = + VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput(); + + auto* top_conv_weight = VarNode("top_conv_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* top_conv = OpNode("top_conv", "conv2d"); + auto* top_conv_out = VarNode("top_conv_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* top_bn_scale = VarNode("top_bn_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* top_bn_bias = VarNode("top_bn_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* top_bn_mean = VarNode("top_bn_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* top_bn_var = VarNode("top_bn_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* top_bn = OpNode("top_bn", "batch_norm")->AsIntermediate(); + auto* top_bn_out = VarNode("top_bn_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* top_bn_mean_out = VarNode("top_bn_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* top_bn_var_out = + VarNode("top_bn_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* top_bn_saved_mean = + VarNode("top_bn_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* top_bn_saved_var = + VarNode("top_bn_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* top_relu = OpNode("top_relu", "relu")->AsIntermediate(); + auto* top_relu_out = VarNode("top_relu_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("pool2d", "X") + ->AsIntermediate(); + auto* top_pool = OpNode("top_pool", "pool2d")->AsIntermediate(); + auto* top_pool_out = + VarNode("top_pool_out") + ->assert_is_op_output("pool2d", "Out") + ->assert_is_op_input("resnet_cbam_block0", "Inputs") + ->AsIntermediate(); + + // args are left out + auto* resnet_block0_1 = + OpNode("resnet_block0_1", "resnet_cbam_block0")->AsIntermediate(); + auto* resnet_block0_1_out = + VarNode("resnet_block0_1_out") + ->assert_is_op_output("resnet_cbam_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_1_1 = + OpNode("resnet_block1_1_1", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_1_1_out = + VarNode("resnet_block1_1_1_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_1_2 = + OpNode("resnet_block1_1_2", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_1_2_out = + VarNode("resnet_block1_1_2_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_2 = + OpNode("resnet_block0_2", "resnet_cbam_block0")->AsIntermediate(); + auto* resnet_block0_2_out = + 
VarNode("resnet_block0_2_out") + ->assert_is_op_output("resnet_cbam_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_1 = + OpNode("resnet_block1_2_1", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_2_1_out = + VarNode("resnet_block1_2_1_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_2 = + OpNode("resnet_block1_2_2", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_2_2_out = + VarNode("resnet_block1_2_2_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_3 = + OpNode("resnet_block1_2_3", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_2_3_out = + VarNode("resnet_block1_2_3_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_3 = + OpNode("resnet_block0_3", "resnet_cbam_block0")->AsIntermediate(); + auto* resnet_block0_3_out = + VarNode("resnet_block0_3_out") + ->assert_is_op_output("resnet_cbam_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_1 = + OpNode("resnet_block1_3_1", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_3_1_out = + VarNode("resnet_block1_3_1_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_2 = + OpNode("resnet_block1_3_2", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_3_2_out = + VarNode("resnet_block1_3_2_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_3 = + OpNode("resnet_block1_3_3", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_3_3_out = + VarNode("resnet_block1_3_3_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_4 = + OpNode("resnet_block1_3_4", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_3_4_out = + VarNode("resnet_block1_3_4_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_5 = + OpNode("resnet_block1_3_5", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_3_5_out = + VarNode("resnet_block1_3_5_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_4 = + OpNode("resnet_block0_4", "resnet_cbam_block0")->AsIntermediate(); + auto* resnet_block0_4_out = + VarNode("resnet_block0_4_out") + ->assert_is_op_output("resnet_cbam_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_4_1 = + OpNode("resnet_block1_4_1", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_4_1_out = + VarNode("resnet_block1_4_1_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_4_2 = + OpNode("resnet_block1_4_2", "resnet_cbam_block1")->AsIntermediate(); + auto* resnet_block1_4_2_out = + VarNode("resnet_block1_4_2_out") + ->assert_is_op_output("resnet_cbam_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block2 = + OpNode("resnet_block2", "resnet_cbam_block2")->AsIntermediate(); + auto* resnet_block2_out = + VarNode("resnet_block2_out") + ->assert_is_op_output("resnet_cbam_block2", "Outputs") + ->AsOutput(); + + *input >> *top_conv >> *top_conv_out >> *top_bn >> *top_bn_out >> + *top_relu >> *top_relu_out >> *top_pool >> *top_pool_out >> + *resnet_block0_1 >> *resnet_block0_1_out >> *resnet_block1_1_1 >> + *resnet_block1_1_1_out >> *resnet_block1_1_2 >> + *resnet_block1_1_2_out >> 
*resnet_block0_2 >> *resnet_block0_2_out >> + *resnet_block1_2_1 >> *resnet_block1_2_1_out >> *resnet_block1_2_2 >> + *resnet_block1_2_2_out >> *resnet_block1_2_3 >> + *resnet_block1_2_3_out >> *resnet_block0_3 >> *resnet_block0_3_out >> + *resnet_block1_3_1 >> *resnet_block1_3_1_out >> *resnet_block1_3_2 >> + *resnet_block1_3_2_out >> *resnet_block1_3_3 >> + *resnet_block1_3_3_out >> *resnet_block1_3_4 >> + *resnet_block1_3_4_out >> *resnet_block1_3_5 >> + *resnet_block1_3_5_out >> *resnet_block0_4 >> *resnet_block0_4_out >> + *resnet_block1_4_1 >> *resnet_block1_4_1_out >> *resnet_block1_4_2 >> + *resnet_block1_4_2_out >> *resnet_block2 >> *resnet_block2_out; + + *top_conv_weight >> *top_conv; + *top_bn_scale >> *top_bn; + *top_bn_bias >> *top_bn; + *top_bn_mean >> *top_bn; + *top_bn_var >> *top_bn; + *top_bn >> *top_bn_mean_out; + *top_bn >> *top_bn_var_out; + *top_bn >> *top_bn_saved_mean; + *top_bn >> *top_bn_saved_var; + } + + void handle_placeholder_sa_conv(SSAGraph* graph, + const key2nodes_t& matched, + paddle::lite::Scope* scope, + const std::string& filter_name, + std::vector* max_filter_name) { + auto* filter_t = scope->FindMutableTensor(filter_name); + int filter_len = filter_t->numel(); + float* filter_on_host = filter_t->mutable_data(); + + float max_f = + paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len); + std::unique_ptr filter_int16(new int16_t[filter_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + filter_on_host, filter_int16.get(), max_f, filter_len); + memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t)); + + // create new arg in graph and scope + std::string max_name = filter_name + "_max"; + max_filter_name->push_back(max_name); + auto* max_filter_node = graph->NewArgumentNode(max_name); + max_filter_node->arg()->is_weight = true; + max_filter_node->arg()->type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + DirectedLink(max_filter_node, matched.at("top_conv")); + auto* max_filter_t = scope->NewTensor(max_name); + max_filter_t->Resize({4}); + float* max_ptr = max_filter_t->mutable_data(); + max_ptr[0] = max_f; + max_ptr[1] = max_f; + max_ptr[2] = max_f; + max_ptr[3] = max_f; + } + + void handle_placeholder_last_fc(SSAGraph* graph, + const key2nodes_t& matched, + paddle::lite::Scope* scope, + const std::string& filter_name, + std::vector* max_filter_name) { + auto* filter_t = scope->FindMutableTensor(filter_name); + auto filter_dims = filter_t->dims(); + int filter_len = filter_t->numel(); + float* filter_on_host = filter_t->mutable_data(); + + // XXX(miaotianxiang): Y has already been transposed in model... 
+ float max_f = + paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len); + std::unique_ptr filter_int16(new int16_t[filter_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + filter_on_host, filter_int16.get(), max_f, filter_len); + memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t)); + + // create new arg in graph and scope + std::string max_name = filter_name + "_max"; + max_filter_name->push_back(max_name); + auto* max_filter_node = graph->NewArgumentNode(max_name); + max_filter_node->arg()->is_weight = true; + max_filter_node->arg()->type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + DirectedLink(max_filter_node, matched.at("top_conv")); + auto* max_filter_t = scope->NewTensor(max_name); + max_filter_t->Resize({4}); + float* max_ptr = max_filter_t->mutable_data(); + max_ptr[0] = max_f; + max_ptr[1] = max_f; + max_ptr[2] = max_f; + max_ptr[3] = max_f; + } + + void InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched, + const std::vector& extra_input_vars) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__resnet_cbam"); + op_desc.SetInput("Input", {matched.at("input")->arg()->name}); + std::vector filter_name = { + matched.at("top_conv_weight")->arg()->name}; + std::vector scale_name = { + matched.at("top_bn_scale")->arg()->name}; + std::vector bias_name = { + matched.at("top_bn_bias")->arg()->name}; + std::vector mean_name = { + matched.at("top_bn_mean")->arg()->name}; + std::vector var_name = { + matched.at("top_bn_variance")->arg()->name}; + std::vector max_filter_name; + std::vector resnet_block_vec = { + "resnet_block0_1", + "resnet_block1_1_1", + "resnet_block1_1_2", + "resnet_block0_2", + "resnet_block1_2_1", + "resnet_block1_2_2", + "resnet_block1_2_3", + "resnet_block0_3", + "resnet_block1_3_1", + "resnet_block1_3_2", + "resnet_block1_3_3", + "resnet_block1_3_4", + "resnet_block1_3_5", + "resnet_block0_4", + "resnet_block1_4_1", + "resnet_block1_4_2", + "resnet_block2", + }; + for (auto& block : resnet_block_vec) { + auto* block_op_info = matched.at(block)->stmt()->op_info(); + auto block_filter_name = block_op_info->Input("Filter"); + std::copy(block_filter_name.begin(), + block_filter_name.end(), + std::back_inserter(filter_name)); + auto block_scale_name = block_op_info->Input("Scale"); + std::copy(block_scale_name.begin(), + block_scale_name.end(), + std::back_inserter(scale_name)); + auto block_bias_name = block_op_info->Input("Bias"); + std::copy(block_bias_name.begin(), + block_bias_name.end(), + std::back_inserter(bias_name)); + auto block_mean_name = block_op_info->Input("Mean"); + std::copy(block_mean_name.begin(), + block_mean_name.end(), + std::back_inserter(mean_name)); + auto block_var_name = block_op_info->Input("Var"); + std::copy(block_var_name.begin(), + block_var_name.end(), + std::back_inserter(var_name)); + } + + auto* resnet_cbam_stmt = matched.at("top_conv")->stmt(); + auto* scope = resnet_cbam_stmt->op()->scope(); + for (size_t i = 0; i < filter_name.size(); ++i) { + if (scale_name[i] == "placeholder_sa_conv") { + handle_placeholder_sa_conv( + graph, matched, scope, filter_name[i], &max_filter_name); + continue; + } else if (scale_name[i] == "placeholder_last_fc") { + handle_placeholder_last_fc( + graph, matched, scope, filter_name[i], &max_filter_name); + continue; + } + + auto* filter_t = scope->FindMutableTensor(filter_name[i]); + auto* scale_t = scope->FindMutableTensor(scale_name[i]); + auto* bias_t = scope->FindMutableTensor(bias_name[i]); + auto* mean_t = 
          scope->FindMutableTensor(mean_name[i]);
+      auto* var_t = scope->FindMutableTensor(var_name[i]);
+
+      int mean_len = mean_t->numel();
+      int filter_len = filter_t->numel();
+      int filter_stride = filter_len / mean_len;
+
+      float* filter_on_host = filter_t->mutable_data<float>();
+      float* scale_on_host = scale_t->mutable_data<float>();
+      float* bias_on_host = bias_t->mutable_data<float>();
+      float* mean_on_host = mean_t->mutable_data<float>();
+      float* var_on_host = var_t->mutable_data<float>();
+
+      // Perform preprocess: fold batch_norm into the conv filter and bias
+      for (int i = 0; i < mean_len; ++i) {
+        scale_on_host[i] = scale_on_host[i] / sqrtf(var_on_host[i] + 0.00001f);
+      }
+      for (int i = 0; i < mean_len; ++i) {
+        for (int j = 0; j < filter_stride; ++j) {
+          filter_on_host[i * filter_stride + j] *= scale_on_host[i];
+        }
+      }
+      for (int i = 0; i < mean_len; ++i) {
+        bias_on_host[i] += -mean_on_host[i] * scale_on_host[i];
+      }
+
+      float max_f =
+          paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len);
+      std::unique_ptr<int16_t[]> filter_int16(new int16_t[filter_len]);
+      paddle::lite::xpu::math::ConvertFP32ToInt16(
+          filter_on_host, filter_int16.get(), max_f, filter_len);
+      memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t));
+
+      // create new arg in graph and scope
+      std::string max_name = filter_name[i] + "_max";
+      max_filter_name.push_back(max_name);
+      auto* max_filter_node = graph->NewArgumentNode(max_name);
+      max_filter_node->arg()->is_weight = true;
+      max_filter_node->arg()->type = LiteType::GetTensorTy(
+          TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
+      DirectedLink(max_filter_node, matched.at("top_conv"));
+      auto* max_filter_t = scope->NewTensor(max_name);
+      max_filter_t->Resize({4});
+      float* max_ptr = max_filter_t->mutable_data<float>();
+      max_ptr[0] = max_f;
+      max_ptr[1] = max_f;
+      max_ptr[2] = max_f;
+      max_ptr[3] = max_f;
+    }
+    op_desc.SetInput("Filter", filter_name);
+    op_desc.SetInput("Bias", bias_name);
+    op_desc.SetInput("MaxFilter", max_filter_name);
+    op_desc.SetOutput("Output", {matched.at("resnet_block2_out")->arg()->name});
+    op_desc.SetAttr("xpu", 1);
+    auto* block2_op_info = matched.at("resnet_block2")->stmt()->op_info();
+    op_desc.SetAttr("pool_p", block2_op_info->GetAttr<float>("pool_p"));
+
+    auto resnet_cbam_op = LiteOpRegistry::Global().Create(op_desc.Type());
+    resnet_cbam_op->Attach(op_desc, scope);
+    resnet_cbam_op->SetValidPlaces(resnet_cbam_stmt->op()->valid_places());
+    auto kernels =
+        resnet_cbam_op->CreateKernels(resnet_cbam_op->valid_places());
+    resnet_cbam_stmt->SetOp(resnet_cbam_op);
+    resnet_cbam_stmt->SetKernels(std::move(kernels));
+
+    IR_NODE_LINK_TO(matched.at("top_bn_bias"), matched.at("top_conv"));
+    for (auto* node : extra_input_vars) {
+      IR_NODE_LINK_TO(node, matched.at("top_conv"));
+    }
+    IR_OP_VAR_LINK(matched.at("top_conv"), matched.at("resnet_block2_out"));
+  }
+};
+
+}  // namespace fusion
+
+class XPUResNetCbamFusePass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
+    if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
+    fusion::XPUResNetCbamBlock0Fuser block0_fuser;
+    block0_fuser(graph.get());
+    fusion::XPUResNetCbamBlock1Fuser block1_fuser;
+    block1_fuser(graph.get());
+    fusion::XPUResNetCbamBlock2Fuser block2_fuser;
+    block2_fuser(graph.get());
+    fusion::XPUResNetCbamFuser resnet_fuser;
+    resnet_fuser(graph.get());
+  }
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(__xpu__resnet_cbam_fuse_pass,
+                  paddle::lite::mir::XPUResNetCbamFusePass)
+    .BindTargets({TARGET(kXPU)})
+    .BindKernel("__xpu__resnet_cbam");
diff --git 
a/lite/core/mir/fusion/conv_activation_fuse_pass.cc b/lite/core/mir/fusion/conv_activation_fuse_pass.cc index 68c07c0ffd0694aec0ff073082e1192213a0ef4a..20023830123939f1cf83706f69ca8a7a2703b646 100644 --- a/lite/core/mir/fusion/conv_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_activation_fuse_pass.cc @@ -25,21 +25,21 @@ namespace mir { void ConvActivationFusePass::Apply(const std::unique_ptr& graph) { std::vector act_types{"relu"}; bool has_int8 = false; - bool has_arm_float = false; + bool has_arm = false; bool has_cuda = false; for (auto& place : graph->valid_places()) { if (place.precision == PRECISION(kInt8)) { has_int8 = true; } - if (place.target == TARGET(kARM) && place.precision == PRECISION(kFloat)) { - has_arm_float = true; + if (place.target == TARGET(kARM)) { + has_arm = true; } if (place.target == TARGET(kCUDA)) { has_cuda = true; } } - if (!has_int8 && has_arm_float) { + if (has_arm) { act_types.push_back("relu6"); act_types.push_back("leaky_relu"); } @@ -64,4 +64,5 @@ REGISTER_MIR_PASS(lite_conv_activation_fuse_pass, paddle::lite::mir::ConvActivationFusePass) .BindTargets({TARGET(kAny)}) .ExcludeTargets({TARGET(kXPU)}) + .ExcludeTargets({TARGET(kMLU)}) .BindKernel("conv2d"); diff --git a/lite/core/mir/fusion/conv_bn_fuser.cc b/lite/core/mir/fusion/conv_bn_fuser.cc index 69be8dab0a06c26d5ca2bcdfe8327634edb9637d..a8a5a5deb2a57982587d9db9f94cadb367af8595 100644 --- a/lite/core/mir/fusion/conv_bn_fuser.cc +++ b/lite/core/mir/fusion/conv_bn_fuser.cc @@ -156,12 +156,12 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { // little difference for int8 /////////////////////////////////////////////////////////////////////////////// if (enable_int8) { - PADDLE_ENFORCE(conv_op_desc->HasAttr("weight_scale"), - "INT8 mode: Conv should has weight_scale attr"); + std::string weight_name = conv_op_desc->Input("Filter").front(); + CHECK(conv_op_desc->HasInputScale(weight_name)) + << "INT8 mode: Conv should has weight_scale attr"; auto conv_weight_d = conv_weight_t->mutable_data(); // compute new conv_weight for int8 - auto weight_scale = - conv_op_desc->GetAttr>("weight_scale"); + auto weight_scale = conv_op_desc->GetInputScale(weight_name); if (conv_type_ == "conv2d_transpose" && !depthwise) { int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; @@ -188,7 +188,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { } } } - conv_op_desc->SetAttr("weight_scale", weight_scale); + conv_op_desc->SetInputScale(weight_name, weight_scale); } else if (is_weight_quantization) { std::string scale_name = conv_weight_name + "_quant_scale"; if (conv_op_desc->HasAttr(scale_name)) { diff --git a/lite/core/mir/fusion/conv_bn_fuser.h b/lite/core/mir/fusion/conv_bn_fuser.h index 8bd8c0ce0600bb68667d96d07d43fa3028b5a856..841566067ba6675271227adfa82c74defac35f2a 100644 --- a/lite/core/mir/fusion/conv_bn_fuser.h +++ b/lite/core/mir/fusion/conv_bn_fuser.h @@ -18,7 +18,7 @@ #include #include #include "lite/core/mir/pattern_matcher_high_api.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { diff --git a/lite/core/mir/fusion/fc_fuse_pass.cc b/lite/core/mir/fusion/fc_fuse_pass.cc index e2d8f96c53bd76d9495035c6ec56a5364b9bdcf5..d9bffffebfaabcca9c63700caf6e3ee91fa2eecb 100644 --- a/lite/core/mir/fusion/fc_fuse_pass.cc +++ b/lite/core/mir/fusion/fc_fuse_pass.cc @@ -24,8 +24,13 @@ namespace mir { void FcFusePass::Apply(const std::unique_ptr& graph) { 
#ifdef LITE_WITH_X86 +#ifdef LITE_WITH_MLU + fusion::FcFuser fuser(false); + fuser(graph.get()); +#else fusion::FcFuser fuser(true); fuser(graph.get()); +#endif #endif fusion::FcFuser fuser2(false); @@ -38,7 +43,9 @@ void FcFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass) .BindTargets({TARGET(kAny)}) - .ExcludeTargets({TARGET(kXPU), TARGET(kX86)}) + .ExcludeTargets({TARGET(kXPU)}) +#ifndef LITE_WITH_MLU + .ExcludeTargets({TARGET(kX86)}) +#endif .ExcludeTargets({TARGET(kBM)}) - .ExcludeTargets({TARGET(kCUDA)}) .BindKernel("fc"); diff --git a/lite/core/mir/fusion/fc_fuser.cc b/lite/core/mir/fusion/fc_fuser.cc index 3c99131083d37ea2c8511ed136bff17c891529af..8fdde50fc3015b411ee13fed15e92a93a1c722e5 100644 --- a/lite/core/mir/fusion/fc_fuser.cc +++ b/lite/core/mir/fusion/fc_fuser.cc @@ -71,7 +71,20 @@ void FcFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { } cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) { - cpp::OpDesc op_desc = *matched.at("mul")->stmt()->op_info(); + auto op_desc = *matched.at("mul")->stmt()->op_info(); + + // Get the input scale from mul + std::vector x_scale_vct; + std::vector y_scale_vct; + auto input_x_name = op_desc.Input("X").front(); + auto input_y_name = op_desc.Input("Y").front(); + bool is_quantized_op = op_desc.HasInputScale(input_x_name) && + op_desc.HasInputScale(input_y_name); + if (is_quantized_op) { + x_scale_vct = op_desc.GetInputScale(input_x_name); + y_scale_vct = op_desc.GetInputScale(op_desc.Input("Y").front()); + } + op_desc.mutable_inputs()->clear(); op_desc.mutable_outputs()->clear(); op_desc.SetType("fc"); @@ -85,6 +98,13 @@ cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) { if (with_relu_) { op_desc.SetAttr("activation_type", std::string{"relu"}); } + + // Set the input scale into fc + if (is_quantized_op) { + op_desc.SetInputScale(matched.at("x")->arg()->name, x_scale_vct); + op_desc.SetInputScale(matched.at("W")->arg()->name, y_scale_vct); + } + return op_desc; } diff --git a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc index 80a033c75f2e23efa091375ee2a9f78e3ff40d71..ea8400b0bb2cd1680e52d9a92ef79aca4e09887b 100644 --- a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc +++ b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc @@ -34,12 +34,13 @@ void QuantDequantFusePass::Apply(const std::unique_ptr& graph) { } // fuse quantized node and dequant node - for (auto& op_type : {"conv2d", "mul", "depthwise_conv2d"}) { + for (auto& op_type : + {"conv2d", "mul", "depthwise_conv2d", "conv2d_transpose"}) { fusion::DequantOpFuser fuser(op_type); fuser(graph.get()); } - for (auto& op_type : {"conv2d", "depthwise_conv2d"}) { + for (auto& op_type : {"conv2d", "depthwise_conv2d", "conv2d_transpose"}) { fusion::ChannelWiseDequantOpFuser fuser(op_type); fuser(graph.get()); } diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc index f6d03cc23d56f8ae25f22b5b2667ed451ef8afaa..1335518b00db5311b4605148817faed52164fd7a 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc @@ -23,6 +23,20 @@ namespace lite { namespace mir { namespace fusion { +static std::string GetWeightArgname(const std::string& op_type) { + std::string weight_argname{}; + std::vector conv_ops = { + "conv2d", "depthwise_conv2d", "conv2d_transpose"}; + std::vector mul_ops = {"mul", "matmul"}; + if (std::find(conv_ops.begin(), 
conv_ops.end(), op_type) != conv_ops.end()) { + weight_argname = "Filter"; + } else if (std::find(mul_ops.begin(), mul_ops.end(), op_type) != + mul_ops.end()) { + weight_argname = "Y"; + } + return weight_argname; +} + void DeleteQuantOpFuser::BuildPattern() { auto* input_scale_node = VarNode("input_scale_node") ->assert_is_op_input(quant_op_type_, "InScale"); @@ -64,13 +78,7 @@ void DeleteQuantOpFuser::InsertNewNode(SSAGraph* graph, for (auto* quantized_node : outlinks) { // save input scale in quantized op by input argname + index auto op_desc = *quantized_node->stmt()->mutable_op_info(); - std::string argname; - int index; - op_desc.GetInputArgname(out_act_name, &argname); - op_desc.GetInputIndex(out_act_name, &index); - op_desc.SetAttr(argname + std::to_string(index) + "_input_scale", - scale_value); - op_desc.SetAttr("input_scale", scale_value); // save it for now + op_desc.SetInputScale(out_act_name, {scale_value}); op_desc.SetAttr("bit_length", bit_length); op_desc.UpdateAllInputs(out_act_name, in_act_name); quantized_node->stmt()->ResetOp(op_desc, graph->valid_places()); @@ -89,20 +97,13 @@ cpp::OpDesc DeleteQuantOpFuser::GenOpDesc(const key2nodes_t& matched) { } void DequantOpFuser::BuildPattern() { - std::string weight_name = ""; - if (quantized_op_type_ == "conv2d" || - quantized_op_type_ == "depthwise_conv2d") { - weight_name = "Filter"; - } else { - weight_name = "Y"; - } - + std::string weight_argname = GetWeightArgname(quantized_op_type_); auto* quantized_op_input = VarNode("quantized_op_input") ->assert_is_op_input(quantized_op_type_) ->AsInput(); auto* quantized_op_weight = VarNode("quantized_op_weight") - ->assert_is_op_input(quantized_op_type_, weight_name) + ->assert_is_op_input(quantized_op_type_, weight_argname) ->AsInput(); auto* quantized_op = OpNode("quantized_op", quantized_op_type_) ->assert_is_op(quantized_op_type_) @@ -135,6 +136,7 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, auto* quantized_op = matched.at("quantized_op"); auto* dequant_op = matched.at("dequant_op"); auto* dequant_op_out = matched.at("dequant_op_out"); + auto weight_name = quantized_op_weight->arg()->name; // obtain weight_scale from max_range auto* scope = quantized_op->stmt()->op()->scope(); @@ -150,14 +152,15 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, // = max(abs(weight)) / range // set op desc - cpp::OpDesc op_desc = *quantized_op->stmt()->op_info(); + auto op_desc = *quantized_op->stmt()->op_info(); auto quantized_weight_var_name = quantized_op_weight->arg()->name; auto quantized_weight_t = scope->FindVar(quantized_weight_var_name)->GetMutable(); std::vector weight_scale; - int weight_scale_size; + int weight_scale_size = 0; if (quantized_op_type_ == "conv2d" || - quantized_op_type_ == "depthwise_conv2d") { + quantized_op_type_ == "depthwise_conv2d" || + quantized_op_type_ == "conv2d_transpose") { op_desc.SetInput("Input", {quantized_op_input->arg()->name}); op_desc.SetOutput("Output", {dequant_op_out->arg()->name}); // Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should @@ -173,7 +176,7 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, weight_scale.push_back(whole_weight_scale); } op_desc.SetAttr("enable_int8", true); - op_desc.SetAttr("weight_scale", weight_scale); + op_desc.SetInputScale(weight_name, weight_scale); // change the weight from the float type to int8 type. 
Tensor temp_tensor; @@ -204,12 +207,13 @@ cpp::OpDesc DequantOpFuser::GenOpDesc(const key2nodes_t& matched) { void ChannelWiseDequantOpFuser::BuildPattern() { std::string dequant_op_type = "fake_channel_wise_dequantize_max_abs"; + std::string weight_argname = GetWeightArgname(quantized_op_type_); auto* quantized_op_input = VarNode("quantized_op_input") ->assert_is_op_input(quantized_op_type_) ->AsInput(); auto* quantized_op_weight = VarNode("quantized_op_weight") - ->assert_is_op_input(quantized_op_type_, "Filter") + ->assert_is_op_input(quantized_op_type_, weight_argname) ->AsInput(); auto* quantized_op = OpNode("quantized_op", quantized_op_type_) ->assert_is_op(quantized_op_type_) @@ -246,6 +250,7 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, auto* dequant_op_channel_scale = matched.at("dequant_op_channel_scale"); auto* dequant_op = matched.at("dequant_op"); auto* dequant_op_out = matched.at("dequant_op_out"); + auto weight_name = quantized_op_weight->arg()->name; // obtain input weight_scale from fake_dequant op auto* scope = quantized_op->stmt()->op()->scope(); @@ -265,17 +270,20 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, } // set op desc - cpp::OpDesc op_desc = *quantized_op->stmt()->op_info(); + auto op_desc = *quantized_op->stmt()->op_info(); if (quantized_op_type_ == "conv2d" || - quantized_op_type_ == "depthwise_conv2d") { + quantized_op_type_ == "depthwise_conv2d" || + quantized_op_type_ == "conv2d_transpose") { op_desc.SetInput("Input", {quantized_op_input->arg()->name}); op_desc.SetOutput("Output", {dequant_op_out->arg()->name}); } else if (quantized_op_type_ == "mul" || quantized_op_type_ == "matmul") { op_desc.SetInput("X", {quantized_op_input->arg()->name}); op_desc.SetOutput("Out", {dequant_op_out->arg()->name}); } - op_desc.SetAttr("enable_int8", true); - op_desc.SetAttr("weight_scale", weight_scale); + if (quantized_op_type_ != "conv2d_transpose") { + op_desc.SetAttr("enable_int8", true); + } + op_desc.SetInputScale(weight_name, weight_scale); // change the weight from the float type to int8 type. auto quantized_weight_var_name = quantized_op_weight->arg()->name; @@ -352,22 +360,7 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, // Save quantization info in op_info attr auto op_info = *quantized_node->stmt()->op_info(); op_info.SetAttr("bit_length", bit_length); - - std::string argname; - int index; - op_info.GetInputArgname(output_act_name, &argname); - op_info.GetInputIndex(output_act_name, &index); - op_info.SetAttr(argname + std::to_string(index) + "_input_scale", - scale_value); - std::string op_type = op_info.Type(); - // Analyse the weight scale or input scale. 
- if (((op_type == "conv2d" || op_type == "depthwise_conv2d") && - argname == "Input") || - ((op_type == "mul" || op_type == "matmul") && argname == "Y")) { - op_info.SetAttr("weight_scale", scale_value); - } else { - op_info.SetAttr("input_scale", scale_value); - } + op_info.SetInputScale(output_act_name, {scale_value}); op_info.UpdateAllInputs(output_act_name, input_act_name); quantized_node->stmt()->ResetOp(op_info, graph->valid_places()); diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc index 5ad094fd4219bcbb3c59ec1c71f42af6cac5a11a..92804d6e72e7a2de6f3a6f3b47f338aecd25aa8c 100644 --- a/lite/core/mir/memory_optimize_pass.cc +++ b/lite/core/mir/memory_optimize_pass.cc @@ -314,4 +314,5 @@ REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) TARGET(kXPU), TARGET(kBM), TARGET(kRKNPU), - TARGET(kAPU)}); + TARGET(kAPU), + TARGET(kMLU)}); diff --git a/lite/core/mir/mlu_postprocess_pass.cc b/lite/core/mir/mlu_postprocess_pass.cc index ba48d5d4ead5ea922ded0bff3a87c2c127595790..46738dd49c16fd9736d61711b4baf56d51247699 100644 --- a/lite/core/mir/mlu_postprocess_pass.cc +++ b/lite/core/mir/mlu_postprocess_pass.cc @@ -14,18 +14,22 @@ #include "lite/core/mir/mlu_postprocess_pass.h" #include +#include #include #include #include #include #include "lite/core/mir/graph_visualize_pass.h" #include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/subgraph/subgraph_detector.h" #include "lite/operators/subgraph_op.h" namespace paddle { namespace lite { namespace mir { +static thread_local int g_stream_id = 0; + Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, const std::string& cast_arg_name, SSAGraph* graph, @@ -37,6 +41,10 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, cast_arg->AsArg().type = cast_type; inst_node->AsStmt().op()->scope()->Var(cast_arg_name); + VLOG(4) << "insert cast before subgraph"; + VLOG(4) << "curent node type: " << cur_node->AsArg().type->name() + << " cast to node type: " << cast_type->name(); + // create the stmt node auto* cast_inst = graph->NewInstructNode(); // create op @@ -60,14 +68,17 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, CHECK(0) << "Unsupport cast type"; } cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + + auto v_places = graph->valid_places(); // create kernels - auto kernels = cast_op->CreateKernels(graph->valid_places()); + auto kernels = cast_op->CreateKernels(v_places); std::vector> selected_kernels; bool is_found = false; for (auto& kernel : kernels) { if (op_type == "cast") { const Type* in_arg_ty = kernel->GetInputDeclType("X"); - if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type)) { + if (PrecisionCompatibleTo(*in_arg_ty, *cur_node->AsArg().type) && + DataLayoutCompatible(*in_arg_ty, *cur_node->AsArg().type)) { is_found = true; } } else if (op_type == "layout") { @@ -83,24 +94,22 @@ Node* MLUPostprocessPass::InsertCastBefore(const std::string& op_type, const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); if (TargetCompatibleTo(*in_arg_ty, *cur_node->AsArg().type) && - TargetCompatibleTo(*out_arg_ty, *cast_type)) { + TargetCompatibleTo(*out_arg_ty, *cast_type) && + PrecisionCompatible(*in_arg_ty, *cur_node->AsArg().type) && + PrecisionCompatible(*out_arg_ty, *cast_type)) { is_found = true; } } else { CHECK(0) << "Unsupport cast type"; } if (is_found) { + VLOG(4) << "insert kernel: " << kernel->name(); 
selected_kernels.emplace_back(std::move(kernel)); // we pick the kernel cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); auto& stmt = cast_inst->AsStmt(); - if (op_type == "layout") { - stmt.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(TARGET(kX86))); - } else { - stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( - stmt.picked_kernel().target())); - } + stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + stmt.picked_kernel().target(), g_stream_id)); break; } } @@ -124,6 +133,9 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, auto* var = inst_node->AsStmt().op()->scope()->Var(cast_arg_name); // for CastAfter manully set the tensor's type var->GetMutable(); + VLOG(4) << "insert cast after subgraph"; + VLOG(4) << "curent node type: " << cur_node->AsArg().type->name() + << " cast to node type: " << cast_type->name(); // create the stmt node auto* cast_inst = graph->NewInstructNode(); @@ -133,8 +145,8 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, cpp::OpDesc op_desc; op_desc.SetType(op_type); if (op_type == "cast") { - op_desc.SetAttr("in_dtype", 4); // FP32 - op_desc.SetAttr("out_dtype", 5); // FP16 + op_desc.SetAttr("in_dtype", 4); // FP16 + op_desc.SetAttr("out_dtype", 5); // FP32 op_desc.SetInput("X", {cast_arg_name}); op_desc.SetOutput("Out", {cur_node->AsArg().name}); } else if (op_type == "layout") { @@ -150,8 +162,9 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + auto v_places = graph->valid_places(); // create kernels - auto kernels = cast_op->CreateKernels(graph->valid_places()); + auto kernels = cast_op->CreateKernels(v_places); std::vector> selected_kernels; bool is_found = false; for (auto& kernel : kernels) { @@ -164,14 +177,17 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); if (DataLayoutCompatible(*in_arg_ty, *cast_type) && - DataLayoutCompatible(*out_arg_ty, *cur_node->AsArg().type)) { + DataLayoutCompatible(*out_arg_ty, *cur_node->AsArg().type) && + PrecisionCompatibleTo(*in_arg_ty, *cast_type)) { is_found = true; } } else if (op_type == "io_copy") { const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); if (TargetCompatibleTo(*in_arg_ty, *cast_type) && - TargetCompatibleTo(*out_arg_ty, *cur_node->AsArg().type)) { + TargetCompatibleTo(*out_arg_ty, *cur_node->AsArg().type) && + PrecisionCompatible(*in_arg_ty, *cur_node->AsArg().type) && + PrecisionCompatible(*out_arg_ty, *cast_type)) { is_found = true; } } else { @@ -182,13 +198,8 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, // we pick the kernel cast_inst->AsStmt(op_type, std::move(selected_kernels), cast_op); auto& stmt = cast_inst->AsStmt(); - if (op_type == "layout") { - stmt.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(TARGET(kX86))); - } else { - stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( - stmt.picked_kernel().target())); - } + stmt.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + stmt.picked_kernel().target(), g_stream_id)); break; } } @@ -203,7 +214,8 @@ Node* MLUPostprocessPass::InsertCastAfter(const std::string& op_type, void MLUPostprocessPass::InsertBefore(SSAGraph* graph, Node* head_node, Node* 
inst_node, - const Type* inst_type) { + const Type* inst_type, + bool use_mlu_cast) { const auto* head_type = head_node->AsArg().type; // break original link @@ -218,39 +230,52 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph, head_node->AsArg().name) != first_conv_nodes_.end(); // precision cast node - if (head_type->precision() != inst_type->precision() && !is_first_conv_head) { + if (!use_mlu_cast) { + if (head_type->precision() != inst_type->precision() && + !is_first_conv_head) { + cur_node = InsertCastBefore("cast", + name_prefix + "cast", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy(head_type->target(), + inst_type->precision(), + head_type->layout())); + } + + // layout cast node + if (head_type->layout() != inst_type->layout()) { + cur_node = InsertCastBefore("layout", + name_prefix + "layout", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy(head_type->target(), + inst_type->precision(), + inst_type->layout())); + } + + // io copy cur_node = InsertCastBefore( - "cast", - name_prefix + "cast", + "io_copy", + name_prefix + "io_copy", graph, cur_node, inst_node, LiteType::GetTensorTy( - head_type->target(), inst_type->precision(), head_type->layout())); - } - - // layout cast node - if (head_type->layout() != inst_type->layout()) { + inst_type->target(), inst_type->precision(), inst_type->layout())); + } else { + // io copy cur_node = InsertCastBefore( - "layout", - name_prefix + "layout", + "io_copy", + name_prefix + "io_copy", graph, cur_node, inst_node, LiteType::GetTensorTy( - head_type->target(), inst_type->precision(), inst_type->layout())); + inst_type->target(), head_type->precision(), head_type->layout())); } - // io copy - cur_node = InsertCastBefore( - "io_copy", - name_prefix + "io_copy", - graph, - cur_node, - inst_node, - LiteType::GetTensorTy( - inst_type->target(), inst_type->precision(), inst_type->layout())); - // connect cur_node to inst_node DirectedLink(cur_node, inst_node); @@ -311,10 +336,9 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node, CHECK(subgraph_precision == PRECISION(kFloat) || subgraph_precision == PRECISION(kFP16)) << "Mlu node has unsupport precision"; - VLOG(4) << "picked kernel precision: " - << PrecisionToStr(subgraph_precision); *arg_type = LiteType::GetTensorTy( subgraph_target, subgraph_precision, subgraph_layout); + VLOG(4) << "picked subgraph kernel type: " << (*arg_type)->name(); break; } } @@ -356,7 +380,8 @@ bool MLUPostprocessPass::NeedInsert(Node* node, const Type* inst_type) { void MLUPostprocessPass::InsertAfter(SSAGraph* graph, Node* tail_node, Node* inst_node, - const Type* inst_type) { + const Type* inst_type, + bool use_mlu_cast) { const auto* tail_type = tail_node->AsArg().type; // break original link @@ -367,39 +392,50 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph, tail_node->AsArg().name + string_format("_%p", inst_node) + "/trans_"; // precision cast node - if (tail_type->precision() != inst_type->precision()) { + if (!use_mlu_cast) { + if (tail_type->precision() != inst_type->precision()) { + cur_node = InsertCastAfter("cast", + name_prefix + "cast", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy(tail_type->target(), + inst_type->precision(), + tail_type->layout())); + } + + // layout cast node + if (tail_type->layout() != inst_type->layout()) { + cur_node = InsertCastAfter("layout", + name_prefix + "layout", + graph, + cur_node, + inst_node, + LiteType::GetTensorTy(tail_type->target(), + inst_type->precision(), + inst_type->layout())); + } + + // io copy 
cur_node = InsertCastAfter( - "cast", - name_prefix + "cast", + "io_copy", + name_prefix + "io_copy", graph, cur_node, inst_node, LiteType::GetTensorTy( - tail_type->target(), inst_type->precision(), tail_type->layout())); - } - - // layout cast node - if (tail_type->layout() != inst_type->layout()) { + inst_type->target(), inst_type->precision(), inst_type->layout())); + } else { cur_node = InsertCastAfter( - "layout", - name_prefix + "layout", + "io_copy", + name_prefix + "io_copy", graph, cur_node, inst_node, LiteType::GetTensorTy( - tail_type->target(), inst_type->precision(), inst_type->layout())); + inst_type->target(), tail_type->precision(), tail_type->layout())); } - // io copy - cur_node = InsertCastAfter( - "io_copy", - name_prefix + "io_copy", - graph, - cur_node, - inst_node, - LiteType::GetTensorTy( - inst_type->target(), inst_type->precision(), inst_type->layout())); - // connect cur_node to inst_node DirectedLink(inst_node, cur_node); @@ -496,6 +532,74 @@ void MLUPostprocessPass::GatherAndModifyFirstConvNodes(SSAGraph* graph) { } } +void MLUPostprocessPass::ModifyInputOutputDataType(SSAGraph* graph) { + for (auto& node : graph->mutable_nodes()) { + if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") { + const Type* subgraph_arg_type = nullptr; + GetSubgraphOpArgType(&node, &subgraph_arg_type, graph); + for (auto& in_node : node.inlinks) { + const auto* in_node_type = in_node->AsArg().type; + VLOG(4) << "MLU subgraph input type: " << in_node->AsArg().name + << *in_node_type; + if (in_node->AsArg().is_weight || in_node->AsArg().is_persist) { + CHECK(in_node_type->target() == TARGET(kHost) && + in_node_type->precision() == PRECISION(kAny) && + in_node_type->layout() == DATALAYOUT(kNCHW)) + << "MLU subgraph unexpected persistent input type!"; + in_node->AsArg().type = LiteType::GetTensorTy( + TARGET(kMLU), PRECISION(kAny), DATALAYOUT(kNHWC)); + } else { + CHECK((in_node_type->target() == TARGET(kHost) || + in_node_type->target() == TARGET(kX86)) && + in_node_type->precision() == PRECISION(kFloat) && + in_node_type->layout() == DATALAYOUT(kNCHW)) + << "MLU subgraph unexpected common input type!"; + } + } + for (auto& out_node : node.outlinks) { + const auto* out_node_type = out_node->AsArg().type; + auto& out_arg = out_node->AsArg(); + VLOG(4) << "MLU subgraph output type: " << out_node->AsArg().name + << *out_node_type; + if (out_node->AsArg().is_weight || out_node->AsArg().is_persist) { + CHECK(out_node_type->target() == TARGET(kHost) && + out_node_type->precision() == PRECISION(kAny) && + out_node_type->layout() == DATALAYOUT(kNCHW)) + << "MLU subgraph unexpected persistent input type!"; + out_node->AsArg().type = LiteType::GetTensorTy( + TARGET(kMLU), PRECISION(kAny), DATALAYOUT(kNHWC)); + } else if (out_node_type->precision() == PRECISION(kAny) && + out_node->outlinks.empty()) { + out_arg.is_persist = true; + out_arg.type = LiteType::GetTensorTy( + TARGET(kMLU), PRECISION(kAny), DATALAYOUT(kNHWC)); + } else { + CHECK(out_node_type->precision() == PRECISION(kFloat)) + << "MLU subgraph unexpected common output type!"; + if (out_node->outlinks.empty()) { + out_arg.type = LiteType::GetTensorTy(TARGET(kHost), + subgraph_arg_type->precision(), + DATALAYOUT(kNHWC)); + VLOG(4) << "unused output node type: " << out_arg.name + << out_node_type->name(); + } else { + out_arg.type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + VLOG(4) << "output node type: " << out_arg.name + << out_node_type->name(); + } + } + const auto target = 
out_node->AsArg().type->target(); + const auto precision = out_node->AsArg().type->precision(); + const auto layout = out_node->AsArg().type->layout(); + VLOG(4) << "arg name: " << out_node->AsArg().name + << " type: " << TargetToStr(target) << ", " + << PrecisionToStr(precision) << ", " << DataLayoutToStr(layout); + } + } + } +} + void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { for (auto& node : graph->mutable_nodes()) { if (!node.IsStmt()) continue; @@ -515,6 +619,16 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { old_type->precision(), paddle::lite_api::DataLayoutType::kNHWC, old_type->device()); + // modify inst feed to NHWC, while set_mlu_input_layout(kNHWC) + // invoked, to keep consistent with actual data layout + auto place = node.AsStmt().place(); + place.layout = DATALAYOUT(kNHWC); + std::vector valid_places = {place}; + auto updated_op_info = *node.AsStmt().op_info(); + node.AsStmt().ResetOp(updated_op_info, valid_places, nullptr); + auto kernel = &(node.AsStmt().picked_kernel()); + VLOG(4) << "kernel info: " << kernel->name(); + node.AsStmt().op()->AttachKernel(kernel); } } } @@ -540,6 +654,213 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { } } +std::pair CheckInputAndInsert(Scope* scope, + cpp::BlockDesc* block_desc, + const std::string& input_name, + const Type* tensor_type, + const Type* subgraph_type) { + auto cur_node = input_name; + bool do_insert = false; + if (!DataLayoutCompatible(*tensor_type, *subgraph_type)) { + auto layout_op = block_desc->AddOp(); + auto layout_arg_name = string_format("%s/layout", cur_node.c_str()); + scope->Var(layout_arg_name); + VLOG(4) << "insert layout for subgraph input, arg tensor name: " + << layout_arg_name; + layout_op->SetType("layout"); + layout_op->SetInput("Input", {cur_node}); + layout_op->SetOutput("Out", {layout_arg_name}); + cur_node = layout_arg_name; + do_insert = true; + } + + if (!PrecisionCompatible(*tensor_type, *subgraph_type) && + tensor_type->precision() != PRECISION(kInt8) && + tensor_type->precision() != PRECISION(kInt32)) { + auto cast_op = block_desc->AddOp(); + auto cast_arg_name = string_format("%s/cast", cur_node.c_str()); + scope->Var(cast_arg_name); + VLOG(4) << "insert cast for subgraph input, arg tensor name: " + << cast_arg_name; + cast_op->SetType("cast"); + cast_op->SetAttr("in_dtype", 5); // FP32 + cast_op->SetAttr("out_dtype", 4); // FP16 + cast_op->SetInput("X", {cur_node}); + cast_op->SetOutput("Out", {cast_arg_name}); + cur_node = cast_arg_name; + do_insert = true; + } + + return std::make_pair(do_insert, cur_node); +} + +std::pair CheckOutputAndInsert( + Scope* scope, + cpp::BlockDesc* block_desc, + const std::string& output_name, + const Type* tensor_type, + const Type* subgraph_type) { + auto cur_node = output_name; + bool do_insert = false; + cpp::OpDesc *layout_op = nullptr, *cast_op = nullptr; + size_t cast_idx = 0; + + // subgraph -> cast -> layout -> output + if (!PrecisionCompatible(*tensor_type, *subgraph_type)) { + cast_op = block_desc->AddOp(); + cast_idx = block_desc->OpsSize() - 1; + CHECK_EQ(cast_op, block_desc->GetOp(cast_idx)); + cast_op->SetType("cast"); + cast_op->SetAttr("in_dtype", 4); // FP16 + cast_op->SetAttr("out_dtype", 5); // FP32 + do_insert = true; + } + + if (!DataLayoutCompatible(*tensor_type, *subgraph_type)) { + auto layout_arg_name = string_format("%s/layout", cur_node.c_str()); + scope->Var(layout_arg_name); + VLOG(4) << "insert layout for subgraph output, arg tensor name: " + << layout_arg_name; + layout_op = block_desc->AddOp(); + 
layout_op->SetType("layout"); + layout_op->SetInput("Input", {layout_arg_name}); + layout_op->SetOutput("Out", {cur_node}); + cur_node = layout_arg_name; + do_insert = true; + } + + if (cast_op) { + cast_op = block_desc->GetOp(cast_idx); + auto cast_arg_name = string_format("%s/cast", cur_node.c_str()); + scope->Var(cast_arg_name); + VLOG(4) << "insert cast for subgraph output, arg tensor name: " + << cast_arg_name; + cast_op->SetInput("X", {cast_arg_name}); + cast_op->SetOutput("Out", {cur_node}); + cur_node = cast_arg_name; + } + + return std::make_pair(do_insert, cur_node); +} + +// insert cast op on mlu, to avoid cast on cpu +void MLUPostprocessPass::AdjustSubgraph(Node* subgraph_node, + const Type* subgraph_type) { + auto subgraph_op = subgraph_node->AsStmt().op(); + CHECK_EQ(subgraph_op->Type(), "subgraph"); + auto op = dynamic_cast(subgraph_op.get()); + CHECK(op); + auto block_desc = op->GetSubBlock(); + + // create a new block desc to keep op sequence correct + cpp::BlockDesc* new_block_desc = new cpp::BlockDesc(); + new_block_desc->ClearOps(); + new_block_desc->ClearVars(); + new_block_desc->SetIdx(block_desc->Idx()); + new_block_desc->SetParentIdx(block_desc->ParentIdx()); + new_block_desc->SetForwardBlockIdx(block_desc->ForwardBlockIdx()); + + // find all IO that is not weight or persist + std::list i_names, o_names; + std::map node_replace; + + // Insert cast op for iotensor which is not weight or persist + for (auto& input : subgraph_node->inlinks) { + auto input_name = input->AsArg().name; + if (!(input->AsArg().is_weight || input->AsArg().is_persist)) { + i_names.emplace_back(input_name); + auto ret = CheckInputAndInsert(op->scope(), + new_block_desc, + input_name, + input->AsArg().type, + subgraph_type); + if (ret.first) { + node_replace[input_name] = ret.second; + } + } + } + for (auto& output : subgraph_node->outlinks) { + auto output_name = output->AsArg().name; + if (!(output->AsArg().is_weight || output->AsArg().is_persist)) { + o_names.emplace_back(output_name); + auto ret = CheckOutputAndInsert(op->scope(), + block_desc, + output_name, + output->AsArg().type, + subgraph_type); + if (ret.first) { + node_replace[output_name] = ret.second; + } + } + } + + // update input and output + for (size_t op_idx = 0; op_idx < block_desc->OpsSize(); ++op_idx) { + auto desc = block_desc->GetOp(op_idx); + auto new_desc = new_block_desc->AddOp(); + *new_desc = *desc; + + if (desc->Type() != "layout" && desc->Type() != "cast") { + auto op_input_args = new_desc->InputArgumentNames(); + for (auto& input_arg : op_input_args) { + auto op_input = new_desc->Input(input_arg); + for (auto& it : i_names) { + auto index = std::find(op_input.begin(), op_input.end(), it); + if (index != op_input.end() && + node_replace.find(it) != node_replace.end()) { + index = op_input.erase(index); + op_input.emplace(index, node_replace.at(it)); + VLOG(4) << new_desc->Type() << "] change input from " << it + << " to " << node_replace.at(it); + } + } + new_desc->SetInput(input_arg, op_input); + } + + auto op_output_args = new_desc->OutputArgumentNames(); + for (auto& output_arg : op_output_args) { + auto op_output = new_desc->Output(output_arg); + for (auto& it : o_names) { + auto index = std::find(op_output.begin(), op_output.end(), it); + if (index != op_output.end() && + node_replace.find(it) != node_replace.end()) { + index = op_output.erase(index); + op_output.emplace(index, node_replace.at(it)); + VLOG(4) << new_desc->Type() << "] change output from " << it + << " to " << node_replace.at(it); + } + } + 
new_desc->SetOutput(output_arg, op_output); + } + } + } + op->SetSubBlock(new_block_desc); +} + +void ModifyValidPlaces(SSAGraph* graph, bool use_mlu_cast) { + // remove invalid places, since only support X86, host, MLU + auto v_places = graph->valid_places(); + for (auto it = v_places.begin(); it != v_places.end();) { + if (it->target != TARGET(kMLU) && it->target != TARGET(kHost) && + it->target != TARGET(kX86)) { + it = v_places.erase(it); + } else { + ++it; + } + } + + if (use_mlu_cast) { + // insert mlu float place for float io copy, no effect to subgraph type + v_places.emplace_back(TARGET(kMLU), PRECISION(kFloat), DATALAYOUT(kNHWC)); + } + + graph->SetValidPlaces(v_places); + VLOG(4) << "valid places after modified:"; + for (auto& p : v_places) { + VLOG(4) << p.DebugString(); + } +} + void MLUPostprocessPass::Apply(const std::unique_ptr& graph) { // currently for non-persistent input and output args, mlu subgraph op // only support float16/float32 data type @@ -549,35 +870,47 @@ void MLUPostprocessPass::Apply(const std::unique_ptr& graph) { // arg_in and arg_out are assumed to be NHWC which user should be aware of. // Thus here we change these args' layout to NHWC #ifdef LITE_WITH_MLU - if (lite::DeviceInfo::Global().InputLayout() == DATALAYOUT(kNHWC)) { + ModifyInputOutputDataType(graph.get()); + + if (lite::TargetWrapperMlu::InputLayout() == DATALAYOUT(kNHWC)) { ModifyLayout(graph.get()); } - if (lite::DeviceInfo::Global().UseFirstConv()) { + if (lite::TargetWrapperMlu::UseFirstConv()) { GatherAndModifyFirstConvNodes(graph.get()); } #endif + g_stream_id = static_cast(reinterpret_cast(graph.get())); + bool disable_mlu_cast = GetBoolFromEnv("LITE_DISABLE_MLU_CAST"); + ModifyValidPlaces(graph.get(), !disable_mlu_cast); // insert io_copy, layout and precision cast of subgraph's inputs and outputs for (auto& node : graph->mutable_nodes()) { if (node.IsStmt() && node.AsStmt().op_type() == "subgraph") { const Type* subgraph_arg_type = nullptr; GetSubgraphOpArgType(&node, &subgraph_arg_type, graph.get()); + if (!disable_mlu_cast) { + AdjustSubgraph(&node, subgraph_arg_type); + } auto links_tmp = node.inlinks; for (auto p_in : links_tmp) { if (NeedInsert(p_in, subgraph_arg_type)) { - InsertBefore(graph.get(), p_in, &node, subgraph_arg_type); + InsertBefore( + graph.get(), p_in, &node, subgraph_arg_type, !disable_mlu_cast); } } links_tmp.assign(node.outlinks.begin(), node.outlinks.end()); for (auto p_out : links_tmp) { if (NeedInsert(p_out, subgraph_arg_type)) { - InsertAfter(graph.get(), p_out, &node, subgraph_arg_type); + InsertAfter( + graph.get(), p_out, &node, subgraph_arg_type, !disable_mlu_cast); } } } } + // std::vector> subgraphs({graph->NodeTopologicalOrder()}); + // SubgraphVisualizer(graph.get(), subgraphs)(); } } // namespace mir diff --git a/lite/core/mir/mlu_postprocess_pass.h b/lite/core/mir/mlu_postprocess_pass.h index 688dd06fb5fbec0c8e1c53acfe4215456ddb4192..5a31c1d8322db7bbc57de8dd18fdaf8ff4b0c885 100644 --- a/lite/core/mir/mlu_postprocess_pass.h +++ b/lite/core/mir/mlu_postprocess_pass.h @@ -79,6 +79,8 @@ class MLUPostprocessPass : public ProgramPass { const Type** arg_type, SSAGraph* graph); + void ModifyInputOutputDataType(SSAGraph* graph); + void ModifyLayout(SSAGraph* graph); bool NeedInsert(Node* node, const Type* inst_type); @@ -86,12 +88,14 @@ class MLUPostprocessPass : public ProgramPass { void InsertBefore(SSAGraph* graph, Node* head_node, Node* inst_node, - const Type* type); + const Type* type, + bool use_mlu_cast); void InsertAfter(SSAGraph* graph, Node* 
tail_node, Node* inst_node, - const Type* type); + const Type* type, + bool use_mlu_cast); Node* InsertCastBefore(const std::string& op_type, const std::string& cast_arg_name, @@ -115,6 +119,8 @@ class MLUPostprocessPass : public ProgramPass { bool IsFirstConvInSubgraph(Node* arg_node, Node* inst); + void AdjustSubgraph(Node* subgraph_node, const Type* op_type); + private: std::set first_conv_nodes_; }; diff --git a/lite/core/mir/quantized_op_attributes_inference_pass.cc b/lite/core/mir/quantized_op_attributes_inference_pass.cc index 66b37446a4cc6a33c09757266c9dd2cbc818325e..259447aa21b76261a266a243dcc9c2a7530c9dc5 100644 --- a/lite/core/mir/quantized_op_attributes_inference_pass.cc +++ b/lite/core/mir/quantized_op_attributes_inference_pass.cc @@ -37,34 +37,53 @@ void QuantizedOpAttributesInferencePass::Apply( auto& inst = op_node->AsStmt(); auto op_info = inst.op_info(); auto op_type = op_info->Type(); - if (!op_info->HasAttr("input_scale")) continue; - bool found = false; - float output_scale; + + // Check if any of the inputs of the op have scale value + bool has_input_scale = false; + for (auto in_var_node : op_node->inlinks) { + CHECK(in_var_node->IsArg()); + auto in_var_node_name = in_var_node->arg()->name; + has_input_scale |= op_info->HasInputScale(in_var_node_name); + } + if (!has_input_scale) continue; + + // Infer the output scale according to its out_threshold or the input scale + // of its adjacent ops + bool is_quantized = true; for (auto out_var_node : op_node->outlinks) { CHECK(out_var_node->IsArg()); + std::vector output_scale; + bool has_output_scale = false; + auto out_var_node_name = out_var_node->arg()->name; for (auto out_op_node : out_var_node->outlinks) { CHECK(out_op_node->IsStmt()); auto& out_inst = out_op_node->AsStmt(); auto out_op_info = out_inst.op_info(); - if (!out_op_info->HasAttr("input_scale")) continue; - auto input_scale = out_op_info->GetAttr("input_scale"); - if (!found) { - found = true; + if (!out_op_info->HasInputScale(out_var_node_name)) continue; + auto input_scale = out_op_info->GetInputScale(out_var_node_name); + if (!has_output_scale) { output_scale = input_scale; + has_output_scale = true; } else { - CHECK_EQ(output_scale, input_scale); + CHECK_EQ(output_scale.size(), input_scale.size()); } } + if (has_output_scale) { + inst.mutable_op_info()->SetOutputScale(out_var_node_name, output_scale); + } else if (op_info->HasAttr("out_threshold")) { + // Only consider one output, there are only one out_threshold + int bit_length = op_info->GetAttr("bit_length"); + int range = (1 << (bit_length - 1)) - 1; + output_scale = std::vector{ + op_info->GetAttr("out_threshold") / range}; + inst.mutable_op_info()->SetOutputScale(out_var_node_name, output_scale); + } else { + is_quantized = false; + } } - if (found) { - inst.mutable_op_info()->SetAttr("output_scale", output_scale); - } else if (op_info->HasAttr("output_scale")) { - int bit_length = op_info->GetAttr("bit_length"); - int range = (1 << (bit_length - 1)) - 1; - output_scale = op_info->GetAttr("output_scale"); - inst.mutable_op_info()->SetAttr("output_scale", output_scale / range); - } - if (op_info->HasAttr("output_scale")) { + + // Fix the missing of the attribute 'enable_int8'. 
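For the out_threshold fallback above, the output scale is simply the recorded threshold divided by the integer range implied by bit_length. A small worked example with illustrative values (bit_length = 8, out_threshold = 6.35):

#include <vector>

// range = (1 << (8 - 1)) - 1 = 127, so the scale is about 6.35 / 127 = 0.05.
std::vector<float> ScaleFromThreshold(float out_threshold, int bit_length) {
  int range = (1 << (bit_length - 1)) - 1;
  return {out_threshold / range};
}
// The pass stores the one-element vector via SetOutputScale(out_var_node_name, ...).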
+ if (is_quantized) { inst.mutable_op_info()->SetAttr("enable_int8", true); } } diff --git a/lite/core/mir/runtime_context_assign_pass.cc b/lite/core/mir/runtime_context_assign_pass.cc index 5b6f968484b7b49838a004c3edfd00ff9b7e5e5e..7ad833b22885204130b50a931dc2da7d040c654c 100644 --- a/lite/core/mir/runtime_context_assign_pass.cc +++ b/lite/core/mir/runtime_context_assign_pass.cc @@ -44,6 +44,10 @@ class RuntimeContextAssignPass : public StmtPass { inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext( inst.picked_kernel().target())); } +#elif LITE_WITH_MLU + inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + inst.picked_kernel().target(), + static_cast(reinterpret_cast(graph.get())))); #else int stream_id = inst.stream_id_; diff --git a/lite/core/mir/static_kernel_pick_pass.cc b/lite/core/mir/static_kernel_pick_pass.cc index 1de0d1a26577b31e1dfc5187562cc80bce6fe4d1..b5dd1f8b9c119f4647b72a35eb71df37f31fc6f8 100644 --- a/lite/core/mir/static_kernel_pick_pass.cc +++ b/lite/core/mir/static_kernel_pick_pass.cc @@ -110,15 +110,16 @@ void StaticKernelPickPass::Apply(const std::unique_ptr& graph) { if (out_type_int8) { auto out_node = node.outlinks.front(); CHECK(out_node->IsArg()); + auto out_node_name = out_node->arg()->name; auto one_adj_op_node = out_node->outlinks.front(); CHECK(one_adj_op_node->IsStmt()); auto& one_adj_instruct = one_adj_op_node->AsStmt(); CHECK(one_adj_instruct.op_info()->HasAttr("enable_int8")); - CHECK(one_adj_instruct.op_info()->HasAttr("input_scale")); + CHECK(one_adj_instruct.op_info()->HasInputScale(out_node_name)); - instruct.mutable_op_info()->SetAttr( - "output_scale", - one_adj_instruct.op_info()->GetAttr("input_scale")); + instruct.mutable_op_info()->SetOutputScale( + out_node_name, + one_adj_instruct.op_info()->GetInputScale(out_node_name)); auto update_desc = *instruct.mutable_op_info(); instruct.ResetOp(update_desc, graph->valid_places()); diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc index 31a38280ff537d486f5fb3ba46dee5b025d3f1f1..4b9f34225f70e9050b2605b49e888ed323536b2f 100644 --- a/lite/core/mir/subgraph/subgraph_detector.cc +++ b/lite/core/mir/subgraph/subgraph_detector.cc @@ -425,20 +425,45 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, subgraph_op_desc.SetAttr("sub_block", sub_block_idx); // Extract input and output nodes from the target subgraph - std::set input_var_nodes; + std::set idata_var_nodes; std::set weight_var_nodes; - std::set output_var_nodes; + std::set odata_var_nodes; std::set local_var_nodes; std::set unused_var_nodes; ExtractInputsOutputs(subgraph_nodes, - &input_var_nodes, + &idata_var_nodes, &weight_var_nodes, - &output_var_nodes, + &odata_var_nodes, &local_var_nodes, &unused_var_nodes); - + // A simplified model without the original weight/local/unused nodes on the + // subgraph ops will be saved only if 'SUBGRAPH_DISABLE_ONLINE_MODE' is set to + // true and Predictor->Run(...), Predictor->Save(...) is called. 
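In the runtime_context_assign_pass hunk above, the MLU branch derives a stream id from the graph object's address so that every kernel of one program shares the same stream. The cast template arguments do not survive in this patch text, so the sketch below assumes the usual pointer-to-integer narrowing:

#include <cstdint>

// Hash a per-program object address into a small integer stream id.
int StreamIdFromGraph(const void* graph_ptr) {
  return static_cast<int>(reinterpret_cast<std::intptr_t>(graph_ptr));
}
// usage: NewContext(inst.picked_kernel().target(), StreamIdFromGraph(graph.get()))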
+ std::set input_var_nodes(idata_var_nodes.begin(), + idata_var_nodes.end()); + std::set output_var_nodes(odata_var_nodes.begin(), + odata_var_nodes.end()); + if (!GetBoolFromEnv(SUBGRAPH_DISABLE_ONLINE_MODE)) { + input_var_nodes.insert(weight_var_nodes.begin(), weight_var_nodes.end()); + output_var_nodes.insert(local_var_nodes.begin(), local_var_nodes.end()); + output_var_nodes.insert(unused_var_nodes.begin(), unused_var_nodes.end()); + } // Set input and output name mapping which stores the real inputs and // outputs + std::vector idata_var_names; + std::vector odata_var_names; + for (auto &var_node : idata_var_nodes) { + idata_var_names.push_back(var_node->AsArg().name); + } + for (auto &var_node : odata_var_nodes) { + odata_var_names.push_back(var_node->AsArg().name); + } + subgraph_op_desc.SetAttr>("input_data_names", + idata_var_names); + subgraph_op_desc.SetAttr>("output_data_names", + odata_var_names); + // Set all of the inputs and outputs to the target subgraph op + // To prevent vars are removed in RuntimeProgram::UpdateVarsOfProgram() std::vector input_var_names; std::vector output_var_names; for (auto &var_node : input_var_nodes) { @@ -447,60 +472,36 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, for (auto &var_node : output_var_nodes) { output_var_names.push_back(var_node->AsArg().name); } - subgraph_op_desc.SetAttr>("input_data_names", - input_var_names); - subgraph_op_desc.SetAttr>("output_data_names", - output_var_names); + subgraph_op_desc.SetInput("Inputs", input_var_names); + subgraph_op_desc.SetOutput("Outputs", output_var_names); + auto subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + static_cast(subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + auto any_op = (*subgraph_nodes.begin())->AsStmt().op(); + subgraph_op->Attach(subgraph_op_desc, any_op->scope()); - // Set input/output scale values of input/output var nodes for - // type_precision_cast_pass. - std::vector input_data_scales; - std::vector output_data_scales; + // Export the scale values of the input/output var nodes of the inner op nodes + // only for type_precision_cast_pass. 
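To summarize the subgraph op wiring above: the input_data_names/output_data_names attributes carry only the real data tensors, while Inputs/Outputs may additionally list weights and local vars so their variables survive RuntimeProgram::UpdateVarsOfProgram(). A condensed fragment with illustrative tensor names; the attribute value type is assumed to be std::vector<std::string>, which the elided template arguments above suggest:

#include <string>
#include <vector>
#include "lite/model_parser/cpp_desc.h"  // cpp::OpDesc

void WireSubgraphDesc(paddle::lite::cpp::OpDesc* subgraph_op_desc) {
  // Only the true graph inputs/outputs, consumed by later passes such as
  // type_precision_cast_pass.
  subgraph_op_desc->SetAttr<std::vector<std::string>>("input_data_names", {"image"});
  subgraph_op_desc->SetAttr<std::vector<std::string>>("output_data_names", {"prediction"});
  // Everything the sub-block touches; in online mode this also includes weights.
  subgraph_op_desc->SetInput("Inputs", {"image", "conv1_weights"});
  subgraph_op_desc->SetOutput("Outputs", {"prediction", "pool1_out"});
}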
for (auto &var_node : input_var_nodes) { + auto var_node_name = var_node->arg()->name; auto any_op_node = var_node->outlinks.front(); CHECK(any_op_node->IsStmt()); auto &any_inst = any_op_node->AsStmt(); - if (any_inst.op_info()->HasAttr("input_scale")) { - input_data_scales.push_back( - any_inst.op_info()->GetAttr("input_scale")); + if (any_inst.op_info()->HasInputScale(var_node_name)) { + subgraph_op->mutable_op_info()->SetInputScale( + var_node_name, any_inst.op_info()->GetInputScale(var_node_name)); } } for (auto &var_node : output_var_nodes) { + auto var_node_name = var_node->arg()->name; auto any_op_node = var_node->inlinks.front(); CHECK(any_op_node->IsStmt()); auto &any_inst = any_op_node->AsStmt(); - if (any_inst.op_info()->HasAttr("output_scale")) { - output_data_scales.push_back( - any_inst.op_info()->GetAttr("output_scale")); + if (any_inst.op_info()->HasOutputScale(var_node_name)) { + subgraph_op->mutable_op_info()->SetOutputScale( + var_node_name, any_inst.op_info()->GetOutputScale(var_node_name)); } } - if (input_data_scales.size() > 0) { - subgraph_op_desc.SetAttr>("input_data_scales", - input_data_scales); - } - if (output_data_scales.size() > 0) { - subgraph_op_desc.SetAttr>("output_data_scales", - output_data_scales); - } - - // Set all of the inputs and outputs to the target subgraph op - // To prevent vars are removed in RuntimeProgram::UpdateVarsOfProgram() - for (auto &var_node : weight_var_nodes) { - input_var_names.push_back(var_node->AsArg().name); - } - for (auto &var_node : local_var_nodes) { - output_var_names.push_back(var_node->AsArg().name); - } - for (auto &var_node : unused_var_nodes) { - output_var_names.push_back(var_node->AsArg().name); - } - subgraph_op_desc.SetInput("Inputs", input_var_names); - subgraph_op_desc.SetOutput("Outputs", output_var_names); - auto subgraph_op = LiteOpRegistry::Global().Create("subgraph"); - static_cast(subgraph_op.get()) - ->SetSubBlock(sub_block_desc); - auto any_op = (*subgraph_nodes.begin())->AsStmt().op(); - subgraph_op->Attach(subgraph_op_desc, any_op->scope()); // Create and add a new subgraph node into the graph auto subgraph_op_node = @@ -508,26 +509,13 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph, for (auto &var_node : input_var_nodes) { IR_NODE_LINK_TO(var_node, subgraph_op_node); } - for (auto &var_node : weight_var_nodes) { - IR_NODE_LINK_TO(var_node, subgraph_op_node); - } for (auto &var_node : output_var_nodes) { IR_OP_VAR_LINK(subgraph_op_node, var_node); } - for (auto &var_node : local_var_nodes) { - IR_OP_VAR_LINK(subgraph_op_node, var_node); - } - for (auto &var_node : unused_var_nodes) { - IR_OP_VAR_LINK(subgraph_op_node, var_node); - } // Remove subgraph nodes and unused var nodes - auto nodes2rm = GetNodes2RM(subgraph_nodes, - {input_var_nodes, - weight_var_nodes, - output_var_nodes, - local_var_nodes, - unused_var_nodes}); + auto nodes2rm = + GetNodes2RM(subgraph_nodes, {input_var_nodes, output_var_nodes}); GraphSafeRemoveNodes(graph, nodes2rm); } @@ -602,7 +590,17 @@ std::set GetNodes2RM( std::set nodes2rm(op_nodes.begin(), op_nodes.end()); for (auto &op_node : op_nodes) { for (auto &var_node : op_node->inlinks) { - if (!nodes2rm.count(var_node)) { + bool skip = false; + // skip the var node which is used by any other ops that doesn't belong to + // the subgraph ops. 
+ for (auto &out_op_node : var_node->outlinks) { + if (std::find(op_nodes.begin(), op_nodes.end(), out_op_node) != + op_nodes.end()) { + skip = true; + break; + } + } + if (!skip && !nodes2rm.count(var_node)) { nodes2rm.insert(var_node); } } diff --git a/lite/core/mir/subgraph/subgraph_detector_test.cc b/lite/core/mir/subgraph/subgraph_detector_test.cc index f52c0332fa3cfce904d2b7c8bf010bc3d3ac6ac9..06c9c4c78fedba7cfabcd4ff2dd3804b404f966d 100644 --- a/lite/core/mir/subgraph/subgraph_detector_test.cc +++ b/lite/core/mir/subgraph/subgraph_detector_test.cc @@ -20,7 +20,7 @@ #include "lite/api/paddle_use_passes.h" #include "lite/core/mir/ssa_graph.h" #include "lite/core/program.h" -#include "lite/model_parser/cpp/program_desc.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/model_parser/model_parser.h" DEFINE_string(model_dir, "", "model_dir"); diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc index 8fd3751f9ca1585af6b8b00f23acd6bacf5b7a51..104ad5b4fa819de5ff3501c08c60e9918c93cddf 100644 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -13,8 +13,12 @@ // limitations under the License. #include + #include + #include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" #include "lite/api/test_helper.h" #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" diff --git a/lite/core/mir/type_layout_cast_pass.cc b/lite/core/mir/type_layout_cast_pass.cc index 1133e5ba8203ec9fea177844a6311c993f6b8ff7..44b6eaf1eb0c5c96630dd66d129919b40f3ea8c6 100644 --- a/lite/core/mir/type_layout_cast_pass.cc +++ b/lite/core/mir/type_layout_cast_pass.cc @@ -249,11 +249,13 @@ void OpenCLTypeLayoutTransformPass::Apply( REGISTER_MIR_PASS(type_layout_cast_pass, paddle::lite::mir::TypeLayoutTransformPass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kMLU)}) .BindKernel("layout_once") .BindKernel("layout"); REGISTER_MIR_PASS(type_layout_cast_preprocess_pass, paddle::lite::mir::OpenCLTypeLayoutTransformPass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kMLU)}) .BindKernel("layout_once") .BindKernel("layout"); diff --git a/lite/core/mir/type_precision_cast_pass.cc b/lite/core/mir/type_precision_cast_pass.cc index 25648877568f6427843f8ded6890450c265b4f06..39a94cbca6bd6222da5da1d314ea07475592bf0e 100644 --- a/lite/core/mir/type_precision_cast_pass.cc +++ b/lite/core/mir/type_precision_cast_pass.cc @@ -66,65 +66,30 @@ void UpdateInputs(OpLite* op, const std::string& from, const std::string& to) { } } -// Infer the scale value for the new calib op from the subgraph op -static bool InferScaleFromSubgraph(std::string var_name, - const OpInfo* op_info, - float* scale, - bool reverse = false) { - std::string attr_name = reverse ? "output_data_names" : "input_data_names"; - if (!op_info->HasAttr(attr_name)) return false; - auto input_or_output_names = - op_info->GetAttr>(attr_name); - attr_name = reverse ? "output_data_scales" : "input_data_scales"; - if (!op_info->HasAttr(attr_name)) return false; - auto input_or_output_scales = op_info->GetAttr>(attr_name); - auto size = input_or_output_names.size(); - CHECK(size == input_or_output_scales.size()); - for (size_t i = 0; i < size; i++) { - if (input_or_output_names[i] == var_name) { - *scale = input_or_output_scales[i]; - return true; - } - } - return false; -} - // Infer the scale value for the new calib op from the input_scale of the // current op and output_scale of the previous op. 
// case 1: prev_op->var_node->op_node(int8->any op, with input_scale). -// case 2: prev_op->var_node->op_node(subgraph op, int8->any, with -// input_data_scales). -// case 3: prev_op(any->int8, with output_scale)->var_node->op_node(fp32->any, +// case 2: prev_op(any->int8, with output_scale)->var_node->op_node(fp32->any, // without input_scale). -// case 4: prev_op(any->int8, subgraph_op, with -// output_data_scales)->var_node->op_node(fp32->any, without input_scale). static bool InferScale(Node* var_node, Node* op_node, float* scale) { bool found = false; auto& inst = op_node->AsStmt(); auto op_info = inst.op_info(); auto op_type = op_info->Type(); auto var_name = var_node->AsArg().name; - if (op_type == "subgraph") { - found = InferScaleFromSubgraph(var_name, op_info, scale, false); + if (op_info->HasInputScale(var_name)) { + *scale = op_info->GetInputScale(var_name)[0]; + found = true; } else { - if (op_info->HasAttr("input_scale")) { - *scale = op_info->GetAttr("input_scale"); + // Obtain the output_scale from one of its previous Ops + auto prev_op_node = var_node->inlinks.front(); + CHECK(prev_op_node->IsStmt()); + auto& prev_inst = prev_op_node->AsStmt(); + auto prev_op_info = prev_inst.op_info(); + auto prev_op_type = prev_op_info->Type(); + if (prev_op_info->HasOutputScale(var_name)) { + *scale = prev_op_info->GetOutputScale(var_name)[0]; found = true; - } else { - // Obtain the output_scale from one of its previous Ops - auto prev_op_node = var_node->inlinks.front(); - CHECK(prev_op_node->IsStmt()); - auto& prev_inst = prev_op_node->AsStmt(); - auto prev_op_info = prev_inst.op_info(); - auto prev_op_type = prev_op_info->Type(); - if (prev_op_type == "subgraph") { - found = InferScaleFromSubgraph(var_name, prev_op_info, scale, true); - } else { - if (prev_op_info->HasAttr("output_scale")) { - *scale = prev_op_info->GetAttr("output_scale"); - found = true; - } - } } } return found; diff --git a/lite/core/op_lite.cc b/lite/core/op_lite.cc index 537636065d6aeea67fd7c8c71fb00b183720fecc..585aaf3b703bca0a0a34030106dbf793e2a31d52 100644 --- a/lite/core/op_lite.cc +++ b/lite/core/op_lite.cc @@ -18,6 +18,7 @@ #include #include #include "lite/core/op_registry.h" +#include "lite/utils/string.h" namespace paddle { namespace lite { @@ -186,5 +187,114 @@ void OpLite::AttachOutput(const cpp::OpDesc &op_desc, } } +bool OpInfo::GetInputArgname(const std::string &value_name, + std::string *out) const { + for (auto &item : inputs()) { + auto it = std::find(item.second.begin(), item.second.end(), value_name); + if (it != item.second.end()) { + *out = item.first; + return true; + } + } + return false; +} + +bool OpInfo::GetOutputArgname(const std::string &value_name, + std::string *out) const { + for (auto &item : outputs()) { + auto it = std::find(item.second.begin(), item.second.end(), value_name); + if (it != item.second.end()) { + *out = item.first; + return true; + } + } + return false; +} + +bool OpInfo::GetInputIndex(const std::string &input_name, int *out) const { + for (auto &item : inputs()) { + auto it = std::find(item.second.begin(), item.second.end(), input_name); + if (it != item.second.end()) { + *out = it - item.second.begin(); + return true; + } + } + return false; +} + +bool OpInfo::GetOutputIndex(const std::string &output_name, int *out) const { + for (auto &item : outputs()) { + auto it = std::find(item.second.begin(), item.second.end(), output_name); + if (it != item.second.end()) { + *out = it - item.second.begin(); + return true; + } + } + return false; +} + +bool 
OpInfo::HasInputScale(const std::string &input_name) const { + std::string argname; + int index; + if (GetInputArgname(input_name, &argname) && + GetInputIndex(input_name, &index)) { + return HasAttr(argname + to_string(index) + "_scale"); + } else { + return false; + } +} + +bool OpInfo::HasOutputScale(const std::string &output_name) const { + std::string argname; + int index; + if (GetOutputArgname(output_name, &argname) && + GetOutputIndex(output_name, &index)) { + return HasAttr(argname + to_string(index) + "_scale"); + } else { + return false; + } +} + +void OpInfo::SetInputScale(const std::string &input_name, + const std::vector &scale_value) { + std::string argname; + int index; + CHECK(GetInputArgname(input_name, &argname)); + CHECK(GetInputIndex(input_name, &index)); + CHECK(scale_value.size() > 0) + << "Error in SetInputScale: the scales should not be empty"; + SetAttr>(argname + to_string(index) + "_scale", + scale_value); +} + +void OpInfo::SetOutputScale(const std::string &output_name, + const std::vector &scale_value) { + std::string argname; + int index; + CHECK(GetOutputArgname(output_name, &argname)); + CHECK(GetOutputIndex(output_name, &index)); + CHECK(scale_value.size() > 0) + << "Error in SetOutputScale: the scales should not be empty"; + SetAttr>(argname + to_string(index) + "_scale", + scale_value); +} + +std::vector OpInfo::GetInputScale(const std::string &input_name) const { + std::string argname; + int index; + CHECK(GetInputArgname(input_name, &argname)); + CHECK(GetInputIndex(input_name, &index)); + return GetAttr>(argname + to_string(index) + "_scale"); +} + +std::vector OpInfo::GetOutputScale( + const std::string &output_name) const { + std::string argname; + int index; + CHECK(GetOutputArgname(output_name, &argname)); + CHECK(GetOutputIndex(output_name, &index)); + return GetAttr>(argname + to_string(index) + "_scale"); +} + } // namespace lite } // namespace paddle diff --git a/lite/core/op_lite.h b/lite/core/op_lite.h index 301065d5b6bb5c4f41b19d9a9034985ca2f74d89..079586d5e0c00f261bfbf4c7658ccca97402f8ac 100644 --- a/lite/core/op_lite.h +++ b/lite/core/op_lite.h @@ -24,7 +24,7 @@ #include "lite/core/context.h" #include "lite/core/kernel.h" #include "lite/core/scope.h" -#include "lite/model_parser/cpp/op_desc.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/operators/op_params.h" namespace paddle { @@ -229,55 +229,8 @@ class OpInfo : public cpp::OpDesc { return OutputArgumentNames(); } - bool GetInputArgname(const std::string &value_name, std::string *out) const { - for (auto &item : inputs_) { - auto it = std::find(item.second.begin(), item.second.end(), value_name); - if (it != item.second.end()) { - *out = item.first; - return true; - } - } - return false; - } - bool GetOutputArgname(const std::string &value_name, std::string *out) const { - for (auto &item : outputs_) { - auto it = std::find(item.second.begin(), item.second.end(), value_name); - if (it != item.second.end()) { - *out = item.first; - return true; - } - } - return false; - } - - // For the input variable name, find the index of the corresponding - // input argname - bool GetInputIndex(const std::string &value_name, int *out) const { - for (auto &item : inputs_) { - auto it = std::find(item.second.begin(), item.second.end(), value_name); - if (it != item.second.end()) { - *out = it - item.second.begin(); - return true; - } - } - return false; - } - - // For the output variable name, find the index of the corresponding - // output argname - bool GetOutputIndex(const std::string 
&value_name, int *out) const { - for (auto &item : outputs_) { - auto it = std::find(item.second.begin(), item.second.end(), value_name); - if (it != item.second.end()) { - *out = it - item.second.begin(); - return true; - } - } - return false; - } - void UpdateAllInputs(const std::string &from, const std::string &to) { - for (auto &item : inputs_) { + for (auto &item : *mutable_inputs()) { for (auto &var : item.second) { if (var == from) var = to; } @@ -285,12 +238,32 @@ class OpInfo : public cpp::OpDesc { } void UpdateAllOutputs(const std::string &from, const std::string &to) { - for (auto &item : outputs_) { + for (auto &item : *mutable_outputs()) { for (auto &var : item.second) { if (var == from) var = to; } } } + + bool GetInputArgname(const std::string &value_name, std::string *out) const; + bool GetOutputArgname(const std::string &value_name, std::string *out) const; + + bool GetInputIndex(const std::string &input_name, int *out) const; + bool GetOutputIndex(const std::string &output_name, int *out) const; + + bool HasInputScale(const std::string &input_name) const; + bool HasOutputScale(const std::string &output_name) const; + + void SetInputScale(const std::string &input_name, + const std::vector &scale_value); + void SetOutputScale(const std::string &output_name, + const std::vector &scale_value); + + // For conv2d, depthwise_conv2d and mul, the scale of weight are a vector. + // Otherwise, all input and output scales are scalar, but we save these + // as vecotr. + std::vector GetInputScale(const std::string &input_name) const; + std::vector GetOutputScale(const std::string &output_name) const; }; } // namespace lite diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc index ef6d3cfaf001ea55cef23faee11d508920c49715..cb773edd18ee236a30cbfcf5d6b1ce5773f0269d 100644 --- a/lite/core/op_registry.cc +++ b/lite/core/op_registry.cc @@ -17,277 +17,5 @@ #include namespace paddle { -namespace lite { - -const std::map &GetOp2PathDict() { - return OpKernelInfoCollector::Global().GetOp2PathDict(); -} - -std::list> KernelRegistry::Create( - const std::string &op_type, - TargetType target, - PrecisionType precision, - DataLayoutType layout) { - Place place{target, precision, layout}; - VLOG(5) << "creating " << op_type << " kernel for " << place.DebugString(); -#define CREATE_KERNEL1(target__, precision__) \ - switch (layout) { \ - case DATALAYOUT(kNCHW): \ - return Create(op_type); \ - case DATALAYOUT(kAny): \ - return Create(op_type); \ - case DATALAYOUT(kNHWC): \ - return Create(op_type); \ - case DATALAYOUT(kImageDefault): \ - return Create(op_type); \ - case DATALAYOUT(kImageFolder): \ - return Create(op_type); \ - case DATALAYOUT(kImageNW): \ - return Create(op_type); \ - default: \ - LOG(FATAL) << "unsupported kernel layout " << DataLayoutToStr(layout); \ - } - -#define CREATE_KERNEL(target__) \ - switch (precision) { \ - case PRECISION(kFloat): \ - CREATE_KERNEL1(target__, kFloat); \ - case PRECISION(kInt8): \ - CREATE_KERNEL1(target__, kInt8); \ - case PRECISION(kFP16): \ - CREATE_KERNEL1(target__, kFP16); \ - case PRECISION(kAny): \ - CREATE_KERNEL1(target__, kAny); \ - case PRECISION(kInt32): \ - CREATE_KERNEL1(target__, kInt32); \ - case PRECISION(kInt64): \ - CREATE_KERNEL1(target__, kInt64); \ - default: \ - CHECK(false) << "not supported kernel precision " \ - << PrecisionToStr(precision); \ - } - - switch (target) { - case TARGET(kHost): { - CREATE_KERNEL(kHost); - } break; -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_X86) - case TARGET(kX86): { - 
CREATE_KERNEL(kX86); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_CUDA) - case TARGET(kCUDA): { - CREATE_KERNEL(kCUDA); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_ARM) - case TARGET(kARM): { - CREATE_KERNEL(kARM); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_OPENCL) - case TARGET(kOpenCL): { - CREATE_KERNEL(kOpenCL); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_NPU) - case TARGET(kNPU): { - CREATE_KERNEL(kNPU); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_APU) - case TARGET(kAPU): { - CREATE_KERNEL(kAPU); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_XPU) - case TARGET(kXPU): { - CREATE_KERNEL(kXPU); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_FPGA) - case TARGET(kFPGA): { - CREATE_KERNEL(kFPGA); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_BM) - case TARGET(kBM): { - CREATE_KERNEL(kBM); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_MLU) - case TARGET(kMLU): { - CREATE_KERNEL(kMLU); - } break; -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_RKNPU) - case TARGET(kRKNPU): { - CREATE_KERNEL(kRKNPU); - } break; -#endif - default: - CHECK(false) << "not supported kernel target " << TargetToStr(target); - } - -#undef CREATE_KERNEL - return std::list>(); -} - -KernelRegistry::KernelRegistry() : registries_() { -#define INIT_FOR(target__, precision__, layout__) \ - registries_[std::make_tuple(TARGET(target__), \ - PRECISION(precision__), \ - DATALAYOUT(layout__))] \ - .set *>( \ - &KernelRegistryForTarget::Global()); -// Currently, just register 2 kernel targets. -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_CUDA) - INIT_FOR(kCUDA, kFloat, kNCHW); - INIT_FOR(kCUDA, kFloat, kNHWC); - INIT_FOR(kCUDA, kInt8, kNCHW); - INIT_FOR(kCUDA, kFP16, kNCHW); - INIT_FOR(kCUDA, kFP16, kNHWC); - INIT_FOR(kCUDA, kAny, kNCHW); - INIT_FOR(kCUDA, kAny, kAny); - INIT_FOR(kCUDA, kInt8, kNHWC); - INIT_FOR(kCUDA, kInt64, kNCHW); - INIT_FOR(kCUDA, kInt64, kNHWC); - INIT_FOR(kCUDA, kInt32, kNCHW); - INIT_FOR(kCUDA, kInt32, kNHWC); -#endif - -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_MLU) - INIT_FOR(kMLU, kFloat, kNHWC); - INIT_FOR(kMLU, kFloat, kNCHW); - INIT_FOR(kMLU, kFP16, kNHWC); - INIT_FOR(kMLU, kFP16, kNCHW); - INIT_FOR(kMLU, kInt8, kNHWC); - INIT_FOR(kMLU, kInt8, kNCHW); - INIT_FOR(kMLU, kInt16, kNHWC); - INIT_FOR(kMLU, kInt16, kNCHW); -#endif - - INIT_FOR(kHost, kAny, kNCHW); - INIT_FOR(kHost, kAny, kNHWC); - INIT_FOR(kHost, kAny, kAny); - INIT_FOR(kHost, kBool, kNCHW); - INIT_FOR(kHost, kBool, kNHWC); - INIT_FOR(kHost, kBool, kAny); - INIT_FOR(kHost, kFloat, kNCHW); - INIT_FOR(kHost, kFloat, kNHWC); - INIT_FOR(kHost, kFloat, kAny); - INIT_FOR(kHost, kFP16, kNCHW); - INIT_FOR(kHost, kFP16, kNHWC); - INIT_FOR(kHost, kFP16, kAny); - INIT_FOR(kHost, kInt8, kNCHW); - INIT_FOR(kHost, kInt8, kNHWC); - INIT_FOR(kHost, kInt8, kAny); - INIT_FOR(kHost, kInt16, kNCHW); - INIT_FOR(kHost, kInt16, kNHWC); - INIT_FOR(kHost, kInt16, kAny); - INIT_FOR(kHost, kInt32, kNCHW); - INIT_FOR(kHost, kInt32, kNHWC); - INIT_FOR(kHost, kInt32, kAny); - INIT_FOR(kHost, kInt64, kNCHW); - INIT_FOR(kHost, kInt64, kNHWC); - INIT_FOR(kHost, kInt64, kAny); - -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_X86) - INIT_FOR(kX86, kFloat, kNCHW); - INIT_FOR(kX86, kAny, kNCHW); - INIT_FOR(kX86, kAny, kAny); - INIT_FOR(kX86, 
kInt64, kNCHW); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_ARM) - INIT_FOR(kARM, kFloat, kNCHW); - INIT_FOR(kARM, kFloat, kNHWC); - INIT_FOR(kARM, kInt8, kNCHW); - INIT_FOR(kARM, kInt8, kNHWC); - INIT_FOR(kARM, kAny, kNCHW); - INIT_FOR(kARM, kAny, kAny); - INIT_FOR(kARM, kInt32, kNCHW); - INIT_FOR(kARM, kInt64, kNCHW); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_OPENCL) - INIT_FOR(kOpenCL, kFloat, kNCHW); - INIT_FOR(kOpenCL, kFloat, kNHWC); - INIT_FOR(kOpenCL, kAny, kNCHW); - INIT_FOR(kOpenCL, kAny, kNHWC); - INIT_FOR(kOpenCL, kFloat, kAny); - INIT_FOR(kOpenCL, kInt8, kNCHW); - INIT_FOR(kOpenCL, kAny, kAny); - INIT_FOR(kOpenCL, kFP16, kNCHW); - INIT_FOR(kOpenCL, kFP16, kNHWC); - INIT_FOR(kOpenCL, kFP16, kImageDefault); - INIT_FOR(kOpenCL, kFP16, kImageFolder); - INIT_FOR(kOpenCL, kFP16, kImageNW); - INIT_FOR(kOpenCL, kFloat, kImageDefault); - INIT_FOR(kOpenCL, kFloat, kImageFolder); - INIT_FOR(kOpenCL, kFloat, kImageNW); - INIT_FOR(kOpenCL, kAny, kImageDefault); - INIT_FOR(kOpenCL, kAny, kImageFolder); - INIT_FOR(kOpenCL, kAny, kImageNW); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_NPU) - INIT_FOR(kNPU, kFloat, kNCHW); - INIT_FOR(kNPU, kFloat, kNHWC); - INIT_FOR(kNPU, kInt8, kNCHW); - INIT_FOR(kNPU, kInt8, kNHWC); - INIT_FOR(kNPU, kAny, kNCHW); - INIT_FOR(kNPU, kAny, kNHWC); - INIT_FOR(kNPU, kAny, kAny); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_APU) - INIT_FOR(kAPU, kInt8, kNCHW); - INIT_FOR(kXPU, kFloat, kNCHW); - INIT_FOR(kXPU, kInt8, kNCHW); - INIT_FOR(kXPU, kAny, kNCHW); - INIT_FOR(kXPU, kAny, kAny); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_FPGA) - INIT_FOR(kFPGA, kFP16, kNHWC); - INIT_FOR(kFPGA, kFP16, kAny); - INIT_FOR(kFPGA, kFloat, kNHWC); - INIT_FOR(kFPGA, kAny, kNHWC); - INIT_FOR(kFPGA, kAny, kAny); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_BM) - INIT_FOR(kBM, kFloat, kNCHW); - INIT_FOR(kBM, kInt8, kNCHW); - INIT_FOR(kBM, kAny, kNCHW); - INIT_FOR(kBM, kAny, kAny); -#endif -#if !defined(LITE_ON_TINY_PUBLISH) || defined(LITE_WITH_RKNPU) - INIT_FOR(kRKNPU, kFloat, kNCHW); - INIT_FOR(kRKNPU, kInt8, kNCHW); - INIT_FOR(kRKNPU, kAny, kNCHW); - INIT_FOR(kRKNPU, kAny, kAny); -#endif - -#undef INIT_FOR -} - -KernelRegistry &KernelRegistry::Global() { - static auto *x = new KernelRegistry; - return *x; -} - -} // namespace lite +namespace lite {} // namespace lite } // namespace paddle diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index 2128e218554fb304474c14cfacd7867e491a4fe6..90a2b563af7e17a4806bd47cb883d9590cdab40f 100644 --- a/lite/core/op_registry.h +++ b/lite/core/op_registry.h @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -33,19 +32,19 @@ using LiteType = paddle::lite::Type; class OpKernelInfoCollector { public: - static OpKernelInfoCollector &Global() { - static auto *x = new OpKernelInfoCollector; + static OpKernelInfoCollector& Global() { + static auto* x = new OpKernelInfoCollector; return *x; } - void AddOp2path(const std::string &op_name, const std::string &op_path) { + void AddOp2path(const std::string& op_name, const std::string& op_path) { size_t index = op_path.find_last_of('/'); if (index != std::string::npos) { op2path_.insert(std::pair( op_name, op_path.substr(index + 1))); } } - void AddKernel2path(const std::string &kernel_name, - const std::string &kernel_path) { + void AddKernel2path(const std::string& kernel_name, + const std::string& kernel_path) { size_t index = 
kernel_path.find_last_of('/'); if (index != std::string::npos) { kernel2path_.insert(std::pair( @@ -53,13 +52,13 @@ class OpKernelInfoCollector { } } void SetKernel2path( - const std::map &kernel2path_map) { + const std::map& kernel2path_map) { kernel2path_ = kernel2path_map; } - const std::map &GetOp2PathDict() { + const std::map& GetOp2PathDict() { return op2path_; } - const std::map &GetKernel2PathDict() { + const std::map& GetKernel2PathDict() { return kernel2path_; } @@ -71,409 +70,185 @@ class OpKernelInfoCollector { namespace paddle { namespace lite { -const std::map &GetOp2PathDict(); - -using KernelFunc = std::function; -using KernelFuncCreator = std::function()>; -class LiteOpRegistry final : public Factory> { +class OpLiteFactory { public: - static LiteOpRegistry &Global() { - static auto *x = new LiteOpRegistry; - return *x; + // Register a function to create an op + void RegisterCreator(const std::string& op_type, + std::function()> fun) { + op_registry_[op_type] = fun; } - private: - LiteOpRegistry() = default; -}; - -template -class OpLiteRegistor : public Registor { - public: - explicit OpLiteRegistor(const std::string &op_type) - : Registor([&] { - LiteOpRegistry::Global().Register( - op_type, [op_type]() -> std::unique_ptr { - return std::unique_ptr(new OpClass(op_type)); - }); - }) {} -}; -template -using KernelRegistryForTarget = - Factory, std::unique_ptr>; - -class KernelRegistry final { - public: - using any_kernel_registor_t = - variant *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // + static OpLiteFactory& Global() { + static OpLiteFactory* x = new OpLiteFactory; + return *x; + } - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // + std::shared_ptr Create(const std::string& op_type) const { + auto it = op_registry_.find(op_type); + if (it == op_registry_.end()) return nullptr; + return it->second(); + } - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // + std::string DebugString() const { + STL::stringstream ss; + for (const auto& item : op_registry_) { + ss << " - " << item.first << "\n"; + } + return ss.str(); + } - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - 
KernelRegistryForTarget *, // + std::vector GetAllOps() const { + std::vector res; + for (const auto& op : op_registry_) { + res.push_back(op.first); + } + return res; + } - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // + protected: + std::map()>> op_registry_; +}; - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget * // - >; +using LiteOpRegistry = OpLiteFactory; - KernelRegistry(); +// Register OpLite by initializing a static OpLiteRegistrar instance +class OpLiteRegistrar { + public: + OpLiteRegistrar(const std::string& op_type, + std::function()> fun) { + OpLiteFactory::Global().RegisterCreator(op_type, fun); + } + // Touch function is used to guarantee registrar was initialized. + void touch() {} +}; - static KernelRegistry &Global(); +class KernelFactory { + public: + // Register a function to create kernels + void RegisterCreator(const std::string& op_type, + TargetType target, + PrecisionType precision, + DataLayoutType layout, + std::function()> fun) { + op_registry_[op_type][std::make_tuple(target, precision, layout)].push_back( + fun); + } - template - void Register( - const std::string &name, - typename KernelRegistryForTarget::creator_t - &&creator) { - using kernel_registor_t = - KernelRegistryForTarget; - auto &varient = registries_[std::make_tuple(Target, Precision, Layout)]; - auto *reg = varient.template get(); - CHECK(reg) << "Can not be empty of " << name; - reg->Register(name, std::move(creator)); -#ifdef LITE_ON_MODEL_OPTIMIZE_TOOL - kernel_info_map_[name].push_back( - std::make_tuple(Target, Precision, Layout)); -#endif // LITE_ON_MODEL_OPTIMIZE_TOOL + static KernelFactory& Global() { + static KernelFactory* x = new KernelFactory; + return *x; } - template - std::list> Create(const std::string &op_type) { - using kernel_registor_t = - KernelRegistryForTarget; - std::list> kernel_list; - std::tuple temp_tuple( - Target, Precision, Layout); - if (registries_[temp_tuple].valid()) { - kernel_list = - registries_[temp_tuple].template get()->Creates( - op_type); + /** + * Create all kernels belongs to an op. + */ + std::list> Create(const std::string& op_type) { + std::list> res; + if (op_registry_.find(op_type) == op_registry_.end()) return res; + auto& kernel_registry = op_registry_[op_type]; + for (auto it = kernel_registry.begin(); it != kernel_registry.end(); ++it) { + for (auto& fun : it->second) { + res.emplace_back(fun()); + } } - return kernel_list; + return res; } - std::list> Create(const std::string &op_type, + /** + * Create a specific kernel. Return a list for API compatible. 
+ */ + std::list> Create(const std::string& op_type, TargetType target, PrecisionType precision, - DataLayoutType layout); + DataLayoutType layout) { + std::list> res; + if (op_registry_.find(op_type) == op_registry_.end()) return res; + auto& kernel_registry = op_registry_[op_type]; + auto it = kernel_registry.find(std::make_tuple(target, precision, layout)); + if (it == kernel_registry.end()) return res; + for (auto& fun : it->second) { + res.emplace_back(fun()); + } + return res; + } std::string DebugString() const { -#ifndef LITE_ON_MODEL_OPTIMIZE_TOOL - return "No more debug info"; -#else // LITE_ON_MODEL_OPTIMIZE_TOOL STL::stringstream ss; - ss << "\n"; - ss << "Count of kernel kinds: "; - int count = 0; - for (auto &item : kernel_info_map_) { - count += item.second.size(); - } - ss << count << "\n"; - - ss << "Count of registered kernels: " << kernel_info_map_.size() << "\n"; - for (auto &item : kernel_info_map_) { - ss << "op: " << item.first << "\n"; - for (auto &kernel : item.second) { - ss << " - (" << TargetToStr(std::get<0>(kernel)) << ","; - ss << PrecisionToStr(std::get<1>(kernel)) << ","; - ss << DataLayoutToStr(std::get<2>(kernel)); - ss << ")"; - ss << "\n"; - } + for (const auto& item : op_registry_) { + ss << " - " << item.first << "\n"; } - return ss.str(); -#endif // LITE_ON_MODEL_OPTIMIZE_TOOL } - private: - mutable std::map, - any_kernel_registor_t> - registries_; -#ifndef LITE_ON_TINY_PUBLISH - mutable std::map< - std::string, - std::vector>> - kernel_info_map_; -#endif + protected: + // Outer map: op -> a map of kernel. + // Inner map: kernel -> creator function. + // Each kernel was represented by a combination of + std::map, + std::list()>>>> + op_registry_; }; -template -class KernelRegistor : public lite::Registor { +using KernelRegistry = KernelFactory; + +// Register Kernel by initializing a static KernelRegistrar instance +class KernelRegistrar { public: - KernelRegistor(const std::string &op_type, const std::string &alias) - : Registor([=] { - KernelRegistry::Global().Register( - op_type, [=]() -> std::unique_ptr { - std::unique_ptr x(new KernelType); - x->set_op_type(op_type); - x->set_alias(alias); - return x; - }); - }) {} + KernelRegistrar(const std::string& op_type, + TargetType target, + PrecisionType precision, + DataLayoutType layout, + std::function()> fun) { + KernelFactory::Global().RegisterCreator( + op_type, target, precision, layout, fun); + } + // Touch function is used to guarantee registrar was initialized. + void touch() {} }; } // namespace lite } // namespace paddle -// Operator registry -#define LITE_OP_REGISTER_INSTANCE(op_type__) op_type__##__registry__instance__ -#define REGISTER_LITE_OP(op_type__, OpClass) \ - static paddle::lite::OpLiteRegistor LITE_OP_REGISTER_INSTANCE( \ - op_type__)(#op_type__); \ - int touch_op_##op_type__() { \ - OpKernelInfoCollector::Global().AddOp2path(#op_type__, __FILE__); \ - return LITE_OP_REGISTER_INSTANCE(op_type__).Touch(); \ +// Register an op. 
+#define REGISTER_LITE_OP(op_type__, OpClass) \ + static paddle::lite::OpLiteRegistrar op_type__##__registry( \ + #op_type__, []() { \ + return std::unique_ptr(new OpClass(#op_type__)); \ + }); \ + int touch_op_##op_type__() { \ + op_type__##__registry.touch(); \ + OpKernelInfoCollector::Global().AddOp2path(#op_type__, __FILE__); \ + return 0; \ } -// Kernel registry -#define LITE_KERNEL_REGISTER(op_type__, target__, precision__) \ - op_type__##__##target__##__##precision__##__registor__ -#define LITE_KERNEL_REGISTER_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - op_type__##__##target__##__##precision__##__##layout__##registor__instance__##alias__ // NOLINT - -#define LITE_KERNEL_REGISTER_FAKE(op_type__, target__, precision__, alias__) \ - LITE_KERNEL_REGISTER_INSTANCE(op_type__, target__, precision__, alias__) - +// Register a kernel. #define REGISTER_LITE_KERNEL( \ op_type__, target__, precision__, layout__, KernelClass, alias__) \ - static paddle::lite::KernelRegistor \ - LITE_KERNEL_REGISTER_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__)(#op_type__, \ - #alias__); \ - static KernelClass LITE_KERNEL_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__); \ + static paddle::lite::KernelRegistrar \ + op_type__##target__##precision__##layout__##alias__##_kernel_registry( \ + #op_type__, \ + TARGET(target__), \ + PRECISION(precision__), \ + DATALAYOUT(layout__), \ + []() { \ + std::unique_ptr x(new KernelClass); \ + x->set_op_type(#op_type__); \ + x->set_alias(#alias__); \ + return x; \ + }); \ int touch_##op_type__##target__##precision__##layout__##alias__() { \ + op_type__##target__##precision__##layout__##alias__##_kernel_registry \ + .touch(); \ OpKernelInfoCollector::Global().AddKernel2path( \ #op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \ __FILE__); \ - LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \ - .Touch(); \ return 0; \ } \ - static bool LITE_KERNEL_PARAM_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) UNUSED = \ - paddle::lite::ParamTypeRegistry::NewInstance( \ - #op_type__ "/" #alias__) - -#define LITE_KERNEL_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - op_type__##target__##precision__##layout__##alias__ -#define LITE_KERNEL_PARAM_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - op_type__##target__##precision__##layout__##alias__##param_register + static auto \ + op_type__##target__##precision__##layout__##alias__##param_register \ + UNUSED = paddle::lite::ParamTypeRegistry::NewInstance< \ + TARGET(target__), \ + PRECISION(precision__), \ + DATALAYOUT(layout__)>(#op_type__ "/" #alias__) diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 579f7690d7b73bb400d68cbcaf138b32bb23a6ce..70905c96f08d74fc5e27c85c7ccf3d395420a5e9 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -94,6 +94,8 @@ class Optimizer { #endif "identity_dropout_eliminate_pass", "__xpu__resnet_fuse_pass", + "__xpu__resnet_cbam_fuse_pass", + "__xpu__mmdnn_fuse_pass", "__xpu__multi_encoder_fuse_pass", "__xpu__embedding_with_eltwise_add_fuse_pass", "__xpu__fc_fuse_pass", @@ -108,9 +110,13 @@ class Optimizer { "bm_subgraph_pass", "apu_subgraph_pass", "rknpu_subgraph_pass", + "mlu_subgraph_pass", "static_kernel_pick_pass", // pick original kernel from graph + "remove_tf_redundant_ops_pass", "variable_place_inference_pass", // inference arg/var's + + "mlu_postprocess_pass", // info(target/precision/layout/device) // 
using kernel info "argument_type_display_pass", // debug pass: show arg-type-node's @@ -140,13 +146,9 @@ class Optimizer { "variable_place_inference_pass", // "argument_type_display_pass", - "mlu_subgraph_pass", - "runtime_context_assign_pass", "argument_type_display_pass", - "mlu_postprocess_pass", - "memory_optimize_pass"}}; if (passes.size() == 1) { diff --git a/lite/core/program.cc b/lite/core/program.cc index c911d4bba888901aec8a535b1a78528876ca03d3..f9ce00446e936871241405d39c51a2fcab91db32 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -15,9 +15,7 @@ #include "lite/core/program.h" #include #include -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/operators/conditional_block_op.h" #include "lite/operators/subgraph_op.h" #include "lite/operators/while_op.h" diff --git a/lite/core/program.h b/lite/core/program.h index 46d66759a5ae516725fcab90e9c36c39d1683b17..5dff631c70d4f4353b2487df8e37e62143306e85 100644 --- a/lite/core/program.h +++ b/lite/core/program.h @@ -22,7 +22,7 @@ #include "lite/core/kernel.h" #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" -#include "lite/model_parser/cpp/program_desc.h" +#include "lite/model_parser/cpp_desc.h" #ifdef LITE_WITH_PROFILE #include "lite/core/profile/profiler.h" #endif diff --git a/lite/core/scope.h b/lite/core/scope.h index 57e4e3a5e058000f963ff369cbd25e69b9c981c6..41d6ee8f4f55268e3389cd4cada7e48fb8f922d7 100644 --- a/lite/core/scope.h +++ b/lite/core/scope.h @@ -62,19 +62,36 @@ class Scope final { // Create a Tensor variable. This will create a new Variable called `name`. Tensor* NewTensor(const std::string& name) { auto* var = Var(name); - return var->GetMutable(); + return var->GetMutable(); } const Tensor* FindTensor(const std::string& name) { auto* var = FindVar(name); if (!var) return nullptr; - return &var->Get(); + return &var->Get(); } Tensor* FindMutableTensor(const std::string& name) { auto* var = FindVar(name); if (!var) return nullptr; - return var->GetMutable(); + return var->GetMutable(); + } + + std::vector* NewTensorList(const std::string& name) { + auto* var = Var(name); + return var->GetMutable>(); + } + + const std::vector* FindTensorList(const std::string& name) { + auto* var = FindVar(name); + if (!var) return nullptr; + return &var->Get>(); + } + + std::vector* FindMutableTensorList(const std::string& name) { + auto* var = FindVar(name); + if (!var) return nullptr; + return var->GetMutable>(); } private: diff --git a/lite/fluid/data_type.cc b/lite/fluid/data_type.cc index 0dab71ed26c1b4ee438f52e088614bb577a9eade..3ad02a9c53c311a9253bbdf481c9aa6288685654 100644 --- a/lite/fluid/data_type.cc +++ b/lite/fluid/data_type.cc @@ -67,7 +67,7 @@ framework::proto::VarType::Type ToDataType(std::type_index type) { if (it != gDataTypeMap().cpp_to_proto_.end()) { return it->second; } - PADDLE_THROW("Not support %s as tensor type", type.name()); + LOG(FATAL) << "Not support " << type.name() << " as tensor type"; return static_cast(-1); } @@ -76,8 +76,8 @@ std::type_index ToTypeIndex(framework::proto::VarType::Type type) { if (it != gDataTypeMap().proto_to_cpp_.end()) { return it->second; } - PADDLE_THROW("Not support framework::proto::VarType::Type(%d) as tensor type", - static_cast(type)); + LOG(FATAL) << "Not support framework::proto::VarType::Type(" + << static_cast(type) << ") as tensor type"; return std::type_index(typeid(void)); } @@ -86,8 +86,8 @@ 
std::string DataTypeToString(const framework::proto::VarType::Type type) { if (it != gDataTypeMap().proto_to_str_.end()) { return it->second; } - PADDLE_THROW("Not support framework::proto::VarType::Type(%d) as tensor type", - static_cast(type)); + LOG(FATAL) << "Not support framework::proto::VarType::Type(" + << static_cast(type) << ") as tensor type"; return std::string(); } @@ -96,7 +96,8 @@ size_t SizeOfType(framework::proto::VarType::Type type) { if (it != gDataTypeMap().proto_to_size_.end()) { return it->second; } - PADDLE_THROW("Not support %s as tensor type", DataTypeToString(type).c_str()); + LOG(FATAL) << "Not support " << DataTypeToString(type).c_str() + << " as tensor type"; return 0; } diff --git a/lite/fluid/data_type.h b/lite/fluid/data_type.h index a8b11ec465e00356561c95b56f63e3c56cbe8a5b..9896c0d54844b99748e1a7c8bddc5e178f84fb51 100644 --- a/lite/fluid/data_type.h +++ b/lite/fluid/data_type.h @@ -17,7 +17,7 @@ limitations under the License. */ #include #include "lite/core/framework.pb.h" #include "lite/fluid/float16.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -72,7 +72,7 @@ inline void VisitDataType(framework::proto::VarType::Type type, _ForEachDataType_(VisitDataTypeCallback); #undef VisitDataTypeCallback - PADDLE_THROW("Not supported %d", type); + LOG(FATAL) << "Not supported " << type; } extern std::string DataTypeToString(const framework::proto::VarType::Type type); diff --git a/lite/fluid/eigen.h b/lite/fluid/eigen.h index c3af7e9f6c3588f404c614430bf01f7ab5e099e5..3312c9c39eaad4fc0a4225d9734b3f80790b2979 100644 --- a/lite/fluid/eigen.h +++ b/lite/fluid/eigen.h @@ -17,7 +17,7 @@ limitations under the License. */ #include #include "lite/core/tensor.h" #include "lite/fluid/float16.h" -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" #include "unsupported/Eigen/CXX11/Tensor" namespace paddle { @@ -30,7 +30,7 @@ struct EigenDim { using Type = Eigen::DSizes; static Type From(const lite::DDim& dims) { - PADDLE_ENFORCE_EQ(dims.size(), D, "D must match DDim::size"); + CHECK_EQ(dims.size(), D) << "D must match DDim::size"; Type ret; for (size_t d = 0; d < dims.size(); d++) { ret[d] = dims[d]; @@ -39,7 +39,7 @@ struct EigenDim { } static Type From(const DDim::value_type length) { - PADDLE_ENFORCE_EQ(D, 1, "D must be 1."); + CHECK_EQ(D, 1) << "D must be 1."; Type ret; ret[0] = length; return ret; @@ -84,16 +84,16 @@ struct EigenMatrix : public EigenTensor { static typename EigenMatrix::Type Reshape(Tensor& tensor, // NOLINT int num_col_dims) { int rank = tensor.dims().size(); - PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, - "`num_col_dims` must be between (0, rank_of_tensor)."); + CHECK(num_col_dims > 0 && num_col_dims < rank) + << "`num_col_dims` must be between (0, rank_of_tensor)."; return EigenMatrix::From(tensor, tensor.dims().Flatten2D(num_col_dims)); } static typename EigenMatrix::ConstType Reshape(const Tensor& tensor, int num_col_dims) { int rank = tensor.dims().size(); - PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, - "`num_col_dims` must be between (0, rank_of_tensor)."); + CHECK(num_col_dims > 0 && num_col_dims < rank) + << "`num_col_dims` must be between (0, rank_of_tensor)."; return EigenMatrix::From(tensor, tensor.dims().Flatten2D(num_col_dims)); } }; diff --git a/lite/fluid/rw_lock.h b/lite/fluid/rw_lock.h index eb9829425eca9d8bd363a45961302a7f3818e513..f68a21502073ccde6d27c46793d3f8cfa0751af3 100644 --- a/lite/fluid/rw_lock.h +++ 
b/lite/fluid/rw_lock.h @@ -20,7 +20,7 @@ limitations under the License. */ #include // NOLINT #endif // !_WIN32 -#include "lite/utils/paddle_enforce.h" +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -33,17 +33,15 @@ struct RWLock { ~RWLock() { pthread_rwlock_destroy(&lock_); } inline void RDLock() { - PADDLE_ENFORCE_EQ( - pthread_rwlock_rdlock(&lock_), 0, "acquire read lock failed"); + CHECK_EQ(pthread_rwlock_rdlock(&lock_), 0) << "acquire read lock failed"; } inline void WRLock() { - PADDLE_ENFORCE_EQ( - pthread_rwlock_wrlock(&lock_), 0, "acquire write lock failed"); + CHECK_EQ(pthread_rwlock_wrlock(&lock_), 0) << "acquire write lock failed"; } inline void UNLock() { - PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed"); + CHECK_EQ(pthread_rwlock_unlock(&lock_), 0) << "unlock failed"; } private: diff --git a/lite/fluid/selected_rows.cc b/lite/fluid/selected_rows.cc index 98e9325ca2f8fab3f8aa77a0bb074ae5d1be7670..361d63cf5dfd9cd21db47917047a7e2f3758ec96 100644 --- a/lite/fluid/selected_rows.cc +++ b/lite/fluid/selected_rows.cc @@ -119,7 +119,7 @@ void DeserializeFromStream( // the 1st field, unit32_t version for SelectedRows uint32_t version; is.read(reinterpret_cast(&version), sizeof(version)); - PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + CHECK_EQ(version, 0U) << "Only version 0 is supported"; } { // the 2st field, rows information @@ -163,24 +163,22 @@ int64_t SelectedRows::AutoGrownIndex(int64_t key, if (iter == id_to_index_.end()) { rwlock_->UNLock(); if (!auto_grown) { - PADDLE_THROW("key %ld not found", key); + LOG(FATAL) << "key " << key << " not found"; } rwlock_->WRLock(); auto map_size = id_to_index_.size(); auto vector_size = rows_.size(); if (map_size != vector_size) { rwlock_->UNLock(); - PADDLE_THROW( - "id_to_index_ size %lu should have the same size with rows_ %lu", - map_size, - vector_size); + LOG(FATAL) << "id_to_index_ size " << map_size + << " should have the same size with rows_ " << vector_size; } auto write_iter = id_to_index_.find(key); if (write_iter == id_to_index_.end()) { int row_num = rows_.size(); if (row_num == value_->dims()[0]) { rwlock_->UNLock(); - PADDLE_THROW("selected rows is full, then length exceed %d", row_num); + LOG(FATAL) << "selected rows is full, then length exceed " << row_num; } // key logic to put a key into id_to_index_ rows_.push_back(key); @@ -213,16 +211,14 @@ void SelectedRows::Get(const lite::Tensor& ids, lite::Tensor* value, bool auto_grown, bool is_test) { - PADDLE_ENFORCE(value->IsInitialized(), - "The value tensor should be initialized."); + CHECK(value->IsInitialized()) << "The value tensor should be initialized."; if (ids.numel() == 0) { VLOG(3) << "keys is empty, please check data!"; } else { int64_t value_width = value_->numel() / value_->dims()[0]; - PADDLE_ENFORCE_EQ(value_width, - value->numel() / value->dims()[0], - "output tensor should have the same shape with table " - "except the dims[0]."); + CHECK_EQ(value_width, value->numel() / value->dims()[0]) + << "output tensor should have the same shape with table " + "except the dims[0]."; for (int i = 0; i < ids.numel(); ++i) { auto id = ids.data()[i]; int64_t index = AutoGrownIndex(id, auto_grown, is_test); diff --git a/lite/fluid/selected_rows.h b/lite/fluid/selected_rows.h index 5db322f8592f4518d9e1ccc996ffb1e847e7b964..aad93552ebef5d67c77e554b29bf593f5cd176f7 100644 --- a/lite/fluid/selected_rows.h +++ b/lite/fluid/selected_rows.h @@ -82,7 +82,7 @@ class SelectedRows { int64_t Index(int64_t key) const { 
auto it = std::find(rows_.begin(), rows_.end(), key); if (it == rows_.end()) { - PADDLE_THROW("id %ld not in table", key); + LOG(FATAL) << "id " << key << " not in table"; } return static_cast(std::distance(rows_.begin(), it)); } diff --git a/lite/gen_code/gen_code.cc b/lite/gen_code/gen_code.cc index 6c43f6e0116d9adfc4fc6f315d5653b2634dfe7b..a1e69b624a600719121926fc3a4f58391fa63ce6 100644 --- a/lite/gen_code/gen_code.cc +++ b/lite/gen_code/gen_code.cc @@ -59,7 +59,7 @@ void Module::AddHeaderIncludeGenCode() { Line("#include \"lite/gen_code/paddle_infer.h\""); Line("#include \"lite/core/op_registry.h\""); Line("#include \"lite/core/scope.h\""); - Line("#include \"lite/model_parser/cpp/op_desc.h\""); + Line("#include \"lite/model_parser/cpp_desc.h\""); Line(""); Line(""); } diff --git a/lite/gen_code/gen_code.h b/lite/gen_code/gen_code.h index d316eac43f99664fa71cba54b3ab5360852300a0..e100904a7fe4f9c3e489c056ceeeba21657b4944 100644 --- a/lite/gen_code/gen_code.h +++ b/lite/gen_code/gen_code.h @@ -20,9 +20,9 @@ #include "lite/core/program.h" #include "lite/core/target_wrapper.h" #include "lite/core/tensor.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/compatible_pb.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/model_parser/pb/op_desc.h" #include "lite/utils/all.h" diff --git a/lite/gen_code/gen_code_test.cc b/lite/gen_code/gen_code_test.cc index d0b1c1f8b23f90976f4b315a1a4e13069b2136f1..5b3db0de8342f312dcb4443ebcd1fd72e857eea0 100644 --- a/lite/gen_code/gen_code_test.cc +++ b/lite/gen_code/gen_code_test.cc @@ -25,7 +25,7 @@ #include "lite/core/scope.h" #include "lite/core/tensor.h" #include "lite/model_parser/compatible_pb.h" -#include "lite/model_parser/cpp/op_desc.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/model_parser/model_parser.h" #include "lite/model_parser/pb/program_desc.h" diff --git a/lite/kernels/apu/bridges/conv_op.cc b/lite/kernels/apu/bridges/conv_op.cc index ca6e0ff2ac3930fe5cab9230dbbefa0af0a864ab..bf5e313180d9d8089b29f993384bd243b2a5ed05 100644 --- a/lite/kernels/apu/bridges/conv_op.cc +++ b/lite/kernels/apu/bridges/conv_op.cc @@ -35,6 +35,9 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { int neuron_errCode; VLOG(3) << "[APU] Converting [" << op_type << "]"; + CHECK(op_info->HasAttr("enable_int8") && + op_info->GetAttr("enable_int8")); + // Get input and output vars and op attributes auto input_name = op_info->Input("Input").front(); auto input = scope->FindMutableTensor(input_name); @@ -94,30 +97,18 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { input_dims, filter_dims); - float input_scale; - float output_scale; - std::vector weight_scale; - if (op_info->HasAttr("enable_int8")) { - if (op_info->GetAttr("enable_int8")) { - if (op_info->HasAttr("input_scale")) - input_scale = op_info->GetAttr("input_scale"); - if (op_info->HasAttr("weight_scale")) - weight_scale = op_info->GetAttr>("weight_scale"); - if (op_info->HasAttr("output_scale")) - output_scale = op_info->GetAttr("output_scale"); - VLOG(3) << "has output scale:" << output_scale; - } else { - return FAILED; - } - } else { - return FAILED; - } + CHECK(op_info->HasInputScale(input_name)); + auto input_scale = op_info->GetInputScale(input_name)[0]; + CHECK(op_info->HasInputScale(filter_name)); + auto filter_scale = op_info->GetInputScale(filter_name); + CHECK(op_info->HasOutputScale(output_name)); + auto output_scale = 
op_info->GetOutputScale(output_name)[0]; VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups << " ,dilations: " << dilations[0] << ":" << dilations[1]; VLOG(3) << "with_act: " << with_act << " ,act_type:" << act_type; VLOG(3) << "input_dims: " << input_dims << " ,output_dims: " << output_dims - << " ,weight_scale size: " << weight_scale.size(); + << " ,filter_scale size: " << filter_scale.size(); VLOG(3) << "filter_dims: " << filter_dims << " ,memory_size: " << filter->memory_size() << " ,data_size: " << filter->data_size(); @@ -216,10 +207,10 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { NeuronOperandType filterType; NeuronOperandType channelFilterType; NeuronSymmPerChannelQuantParams symmPerChannelQuantParams; - if (1 == weight_scale.size()) { + if (1 == filter_scale.size()) { // Per layer type filterType.type = NEURON_TENSOR_QUANT8_ASYMM; - filterType.scale = weight_scale[0]; + filterType.scale = filter_scale[0]; filterType.zeroPoint = 128; filterType.dimensionCount = filter_dims.size(); filterType.dimensions = &dims_filter[0]; @@ -237,17 +228,17 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { symmPerChannelQuantParams.channelDim = 3; else symmPerChannelQuantParams.channelDim = 0; - symmPerChannelQuantParams.scaleCount = weight_scale.size(); - symmPerChannelQuantParams.scales = weight_scale.data(); + symmPerChannelQuantParams.scaleCount = filter_scale.size(); + symmPerChannelQuantParams.scales = filter_scale.data(); biasType.scale = 0; } std::shared_ptr filter_node = nullptr; - if (1 == weight_scale.size()) { + if (1 == filter_scale.size()) { NeuronModel_addOperand(model, &filterType); // 1: filter filter_node = graph->Add(filter_name, dims_filter); - VLOG(3) << "filter node idx: " << filter_node->index() << "w_scale[0]" - << weight_scale[0] << ": filterType: " << filterType.dimensions[0] + VLOG(3) << "filter node idx: " << filter_node->index() << "filter_scale[0]" + << filter_scale[0] << ": filterType: " << filterType.dimensions[0] << ":" << filterType.dimensions[1] << ":" << filterType.dimensions[2] << ":" << filterType.dimensions[3]; memcpy(filter->mutable_data(), @@ -263,8 +254,8 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { NeuronModel_addOperand(model, &channelFilterType); // 1: filter filter_node = graph->Add(filter_name, dims_filter); VLOG(3) << "chennel filter node idx: " << filter_node->index() - << " ,scale_count:" << weight_scale.size() - << " weight_scale[0]:" << weight_scale.data()[0] + << " ,scale_count:" << filter_scale.size() + << " filter_scale[0]:" << filter_scale.data()[0] << " ,channelFilterType: " << channelFilterType.dimensions[0] << ":" << channelFilterType.dimensions[1] << ":" << channelFilterType.dimensions[2] << ":" @@ -298,7 +289,6 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { std::shared_ptr bias_node = nullptr; if (HasInputArg(op_info, scope, "Bias")) { auto bias_name = op_info->Input("Bias").front(); - auto bias_type = kernel->GetInputDeclType("Bias"); auto bias = scope->FindMutableTensor(bias_name); auto bias_dims = bias->dims(); @@ -364,10 +354,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Add output tensor type NeuronOperandType outType; outType.type = NEURON_TENSOR_QUANT8_ASYMM; - if (graph->IsOutput(output_name)) - outType.scale = output_scale / 127; - else - outType.scale = output_scale; + outType.scale = output_scale; outType.zeroPoint = 128; outType.dimensionCount = output_dims.size(); std::vector dims_out = 
{(uint32_t)output_dims[0], @@ -401,7 +388,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { int32_t* int32_bias_data = reinterpret_cast(bias->mutable_data()); float2int32( - bias->data(), input_scale, weight_scale, int32_bias_data); + bias->data(), input_scale, filter_scale, int32_bias_data); VLOG(3) << "int32_bias_data: " << int32_bias_data[0] << " : " << int32_bias_data[1] << " : " << int32_bias_data[2] << " : " diff --git a/lite/kernels/apu/bridges/fc_op.cc b/lite/kernels/apu/bridges/fc_op.cc index a00a35f9a0766b4fb4f02d05419a0ae42354ca37..106ce2c16f3fd287a27c92179fa3a429c7be57c8 100644 --- a/lite/kernels/apu/bridges/fc_op.cc +++ b/lite/kernels/apu/bridges/fc_op.cc @@ -31,6 +31,10 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto scope = op->scope(); VLOG(3) << "[APU] Converting [" + op_type + "]"; + CHECK(op_info->HasAttr("enable_int8") && + op_info->GetAttr("enable_int8")); + + // Get input and output vars and op attributes auto input_name = op_info->Input("Input").front(); auto input = scope->FindMutableTensor(input_name); auto input_dims = input->dims(); @@ -52,23 +56,12 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { << " out_dims: " << out_dims << " m: " << m << " k: " << k << " n: " << n; - float input_scale = 1.0f; - float out_scale = 1.0f; - std::vector w_scale; - if (op_info->HasAttr("enable_int8")) { - if (op_info->GetAttr("enable_int8")) { - if (op_info->HasAttr("input_scale")) - input_scale = op_info->GetAttr("input_scale"); - if (op_info->HasAttr("weight_scale")) - w_scale = op_info->GetAttr>("weight_scale"); - if (op_info->HasAttr("output_scale")) - out_scale = op_info->GetAttr("output_scale"); - } else { - return FAILED; - } - } else { - return FAILED; - } + CHECK(op_info->HasInputScale(input_name)); + auto input_scale = op_info->GetInputScale(input_name)[0]; + CHECK(op_info->HasInputScale(w_name)); + auto w_scale = op_info->GetInputScale(w_name); + CHECK(op_info->HasOutputScale(out_name)); + auto out_scale = op_info->GetOutputScale(out_name)[0]; // Add input tensor type NeuronOperandType inType; diff --git a/lite/kernels/apu/bridges/pool_op.cc b/lite/kernels/apu/bridges/pool_op.cc index 2bda76ab99af727276102e884f84534b77a59586..b82f23beaf715e8c720ffc22792b804ff6c2c225 100644 --- a/lite/kernels/apu/bridges/pool_op.cc +++ b/lite/kernels/apu/bridges/pool_op.cc @@ -32,6 +32,9 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto scope = op->scope(); VLOG(3) << "[APU] Converting [" + op_type + "] "; + CHECK(op_info->HasAttr("enable_int8") && + op_info->GetAttr("enable_int8")); + // Get input and output vars and op attributes auto x_name = op_info->Input("X").front(); auto x = scope->FindMutableTensor(x_name); @@ -87,22 +90,10 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { ksize); // Add x tensor type - float x_scale = 1.0f; - float out_scale = 1.0f; - if (op_info->HasAttr("enable_int8")) { - if (op_info->GetAttr("enable_int8")) { - if (op_info->HasAttr("input_scale")) - x_scale = op_info->GetAttr("input_scale"); - if (op_info->HasAttr("output_scale")) - out_scale = op_info->GetAttr("output_scale"); - } else { - LOG(WARNING) << "Do not enable_int8"; - return FAILED; - } - } else { - LOG(WARNING) << "Do not enable_int8"; - return FAILED; - } + CHECK(op_info->HasInputScale(x_name)); + auto x_scale = op_info->GetInputScale(x_name)[0]; + CHECK(op_info->HasOutputScale(out_name)); + auto out_scale = op_info->GetOutputScale(out_name)[0]; NeuronOperandType xType; xType.type = 
NEURON_TENSOR_QUANT8_ASYMM; diff --git a/lite/kernels/apu/bridges/softmax_op.cc b/lite/kernels/apu/bridges/softmax_op.cc index 6a289ac987b9fa300cb548d190b6e46b67f24c44..dec6d12307b50798d04f743064360aa6870acfa3 100644 --- a/lite/kernels/apu/bridges/softmax_op.cc +++ b/lite/kernels/apu/bridges/softmax_op.cc @@ -31,6 +31,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto scope = op->scope(); VLOG(3) << "[APU] Converting [" + op_type + "]"; + CHECK(op_info->HasAttr("enable_int8") && + op_info->GetAttr("enable_int8")); + // Get input and output vars and op attributes auto x_name = op_info->Input("X").front(); auto x = scope->FindMutableTensor(x_name); @@ -45,22 +48,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { axis += x_rank; } - float input_scale = 1.0f; - float out_scale = 1.0f; - if (op_info->HasAttr("enable_int8")) { - if (op_info->GetAttr("enable_int8")) { - if (op_info->HasAttr("input_scale")) - input_scale = op_info->GetAttr("input_scale"); - if (op_info->HasAttr("output_scale")) - out_scale = op_info->GetAttr("output_scale"); - } else { - LOG(WARNING) << "Do not enable_int8"; - return FAILED; - } - } else { - LOG(WARNING) << "Do not enable_int8"; - return FAILED; - } + CHECK(op_info->HasInputScale(x_name)); + auto input_scale = op_info->GetInputScale(x_name)[0]; + CHECK(op_info->HasOutputScale(out_name)); + auto out_scale = op_info->GetOutputScale(out_name)[0]; // Check output scale NeuronOperandType xType; @@ -104,14 +95,14 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Add out operand NeuronOperandType outType; outType.type = NEURON_TENSOR_QUANT8_ASYMM; - outType.scale = out_scale / 127; + outType.scale = out_scale; outType.zeroPoint = 128; outType.dimensionCount = x_dims.size(); outType.dimensions = &dims_x[0]; NeuronModel_addOperand(model, &outType); // 3: output std::shared_ptr out_node = nullptr; out_node = graph->Add(out_name, dims_x); - VLOG(3) << "output_scale: " << out_scale; + VLOG(3) << "out_scale: " << out_scale; float beta_val[] = {1.0f}; NeuronModel_setOperandValue( diff --git a/lite/kernels/apu/subgraph_compute.cc b/lite/kernels/apu/subgraph_compute.cc index 6009e71e05c33f6dedfd995020612e112c888d36..21373811dd91d009d834a16d2c437bc722cd676a 100644 --- a/lite/kernels/apu/subgraph_compute.cc +++ b/lite/kernels/apu/subgraph_compute.cc @@ -28,7 +28,7 @@ namespace lite { namespace kernels { namespace apu { -int SubgraphEngine::BuildDeviceProgram() { +bool SubgraphEngine::BuildDeviceProgram() { unsigned int version; Neuron_getVersion(&version); VLOG(3) << "Neuron Adapter version: " << version; @@ -38,7 +38,7 @@ int SubgraphEngine::BuildDeviceProgram() { int neuron_errCode = NeuronModel_create(&model_); if (NEURON_NO_ERROR != neuron_errCode) { LOG(WARNING) << "Fail to create model"; - return subgraph::FAILED; + return false; } graph.set_model(model_); graph.set_input_names(input_names_); @@ -46,6 +46,9 @@ int SubgraphEngine::BuildDeviceProgram() { // Convert all of ops and their input vars and weights and added into the APU // NIR graph + if (origin_program_.empty()) { + BuildOriginProgram(); + } const auto& bridges = subgraph::Registry::Instance(); for (auto& inst : origin_program_) { auto op = const_cast(inst.op()); @@ -54,7 +57,7 @@ int SubgraphEngine::BuildDeviceProgram() { op->InferShape(); std::string op_type = op->op_info()->Type(); if (!bridges.Exists(op_type, TARGET(kAPU))) { - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); @@ -63,7 +66,7 @@ int 
SubgraphEngine::BuildDeviceProgram() { const_cast(op), const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } @@ -84,7 +87,7 @@ int SubgraphEngine::BuildDeviceProgram() { VLOG(3) << "input idx: " << graph.Get(input_names_[i])->index(); } else { LOG(WARNING) << "Fail to find input: " << input_names_[i]; - return subgraph::FAILED; + return false; } } @@ -105,7 +108,7 @@ int SubgraphEngine::BuildDeviceProgram() { VLOG(3) << "output idx: " << graph.Get(output_names_[i])->index(); } else { LOG(WARNING) << "Fail to find output: " << output_names_[i]; - return subgraph::FAILED; + return false; } } @@ -116,7 +119,7 @@ int SubgraphEngine::BuildDeviceProgram() { neuron_errCode = NeuronModel_finish(model_); if (NEURON_NO_ERROR != neuron_errCode) { LOG(WARNING) << "Fail to create NIR model:" << neuron_errCode; - return subgraph::FAILED; + return false; } VLOG(3) << "[APU] APU NIR model created!"; @@ -129,15 +132,14 @@ int SubgraphEngine::BuildDeviceProgram() { compilation_ = lite::apu::Device::Global().Build(model_); if (compilation_ == nullptr) { LOG(WARNING) << "[APU] Build APU DLA model failed!"; - return subgraph::FAILED; + return false; } VLOG(3) << "[APU] APU DLA model created, Build cost " << GetCurrentUS() - start_time << " us"; - - return status; + return true; } -int SubgraphEngine::LaunchDeviceProgram() { +bool SubgraphEngine::LaunchDeviceProgram() { auto GetCurrentUS = []() -> double { struct timeval time; gettimeofday(&time, NULL); @@ -149,22 +151,19 @@ int SubgraphEngine::LaunchDeviceProgram() { int neuron_errCode = NeuronExecution_create(compilation_, &run); if (NEURON_NO_ERROR != neuron_errCode) { LOG(WARNING) << "[APU] Build APU runtime failed!"; - return subgraph::FAILED; + return false; } // Set input buffer - Tensor input_temp; for (size_t i = 0; i < origin_itensors_.size(); i++) { - input_temp.Resize({origin_idims_[i]}); - uint8_t* input_data = input_temp.mutable_data(); - memcpy(input_data, - origin_itensors_[i]->raw_data(), - origin_itensors_[i]->memory_size()); + auto origin_data = origin_itensors_[i]->mutable_data(); + auto converted_data = reinterpret_cast(origin_data); for (int j = 0; j < origin_itensors_[i]->data_size(); j++) { - input_data[j] += (uint8_t)128; + converted_data[j] = + static_cast(static_cast(origin_data[j]) + 128); } NeuronExecution_setInput( - run, i, NULL, input_data, origin_itensors_[i]->memory_size()); + run, i, NULL, converted_data, origin_itensors_[i]->memory_size()); } // Set output buffer @@ -180,19 +179,20 @@ int SubgraphEngine::LaunchDeviceProgram() { neuron_errCode = NeuronExecution_compute(run); if (NEURON_NO_ERROR != neuron_errCode) { LOG(WARNING) << "Fail to run execution!" 
<< neuron_errCode; - return subgraph::FAILED; + return false; } for (size_t i = 0; i < origin_otensors_.size(); i++) { - int8_t* output_data = origin_otensors_[i]->mutable_data(); - VLOG(3) << "output size:" << origin_otensors_[i]->memory_size(); + auto converted_data = origin_otensors_[i]->mutable_data(); + auto origin_data = reinterpret_cast(converted_data); for (int j = 0; j < origin_otensors_[i]->data_size(); j++) { - output_data[j] -= (int8_t)128; + converted_data[j] = + static_cast(static_cast(origin_data[j]) - 128); } } NeuronExecution_free(run); VLOG(3) << "[APU] Process cost " << GetCurrentUS() - start_time << " us"; - return 0; + return true; } SubgraphEngine::~SubgraphEngine() { @@ -213,12 +213,11 @@ void SubgraphCompute::PrepareForRun() { param.output_data_names, param.scope)); CHECK(engine_); - engine_->Build(); } void SubgraphCompute::Run() { CHECK(engine_); - engine_->Launch(); + engine_->Run(); } } // namespace apu diff --git a/lite/kernels/apu/subgraph_compute.h b/lite/kernels/apu/subgraph_compute.h index ecd8a38343cd1f62bb5a3bf8e948384b90cfe826..beb582b8cc16e456491c28ace5e2d1695143216a 100644 --- a/lite/kernels/apu/subgraph_compute.h +++ b/lite/kernels/apu/subgraph_compute.h @@ -41,8 +41,8 @@ class SubgraphEngine : public subgraph::Engine { ~SubgraphEngine(); protected: - int BuildDeviceProgram() override; - int LaunchDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; NeuronModel *model_; NeuronCompilation *compilation_; diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index 218ee3f053fcf49f6a08ffbe0d780509f9b2cc03..6d1d24adcb4cf74b3c6bb991a33316e974dc0110 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -42,6 +42,7 @@ add_kernel(cast_compute_arm ARM basic SRCS cast_compute.cc DEPS ${lite_kernel_de add_kernel(reduce_mean_compute_arm ARM basic SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(stack_compute_arm ARM basic SRCS stack_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(affine_channel_compute_arm ARM basic SRCS affine_channel_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(affine_grid_compute_arm ARM basic SRCS affine_grid_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(range_compute_arm ARM basic SRCS range_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(dropout_compute_arm ARM basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(layout_compute_arm ARM basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -77,6 +78,7 @@ add_kernel(assign_value_compute_arm ARM basic SRCS assign_value_compute.cc DEPS add_kernel(conditional_block_compute_arm ARM extra SRCS conditional_block_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(collect_fpn_proposals_compute_arm ARM extra SRCS collect_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(distribute_fpn_proposals_compute_arm ARM extra SRCS distribute_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(clip_compute_arm ARM extra SRCS clip_compute.cc DEPS ${lite_kernel_deps} math_arm) # for OCR specific add_kernel(gru_unit_compute_arm ARM extra SRCS gru_unit_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -101,7 +103,6 @@ add_kernel(deformable_conv_compute_arm ARM extra SRCS deformable_conv_compute.cc add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(mean_grad_compute_arm ARM train SRCS mean_grad_compute.cc DEPS 
${lite_kernel_deps} math_arm) -add_kernel(activation_grad_compute_arm ARM train SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(elementwise_grad_compute_arm ARM train SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(mul_grad_compute_arm ARM train SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sgd_compute_arm ARM train SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm) diff --git a/lite/kernels/arm/activation_grad_compute.cc b/lite/kernels/arm/activation_grad_compute.cc deleted file mode 100644 index 137668fa5e0d1bd07e838b3040a31e084a7475c8..0000000000000000000000000000000000000000 --- a/lite/kernels/arm/activation_grad_compute.cc +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/activation_grad_compute.h" -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -void SquareGradCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto out_grad_dims = param.Out_grad->dims(); - auto out_grad_data = param.Out_grad->data(); - - auto x_data = param.X->data(); - auto x_grad_data = param.X_grad->mutable_data(); - lite::arm::math::act_square_grad(x_data, - out_grad_data, - x_grad_data, - out_grad_dims.production(), - ctx.threads()); -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(square_grad, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::SquareGradCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) - .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kARM))}) - .Finalize(); diff --git a/lite/kernels/arm/affine_grid_compute.cc b/lite/kernels/arm/affine_grid_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..1ea5512bf3a3e9944855b36277784b6a06e050bb --- /dev/null +++ b/lite/kernels/arm/affine_grid_compute.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/arm/affine_grid_compute.h" +#include +#include +#include "lite/backends/arm/math/funcs.h" +#include "lite/backends/arm/math/sgemm.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void AffineGridCompute::PrepareForRun() { + auto& param = Param(); + auto& ctx = this->ctx_->template As(); + + const lite::Tensor* x = param.X; + const float* din = x->data(); + lite::Tensor* out = param.Out; + float* dout = param.Out->mutable_data(); + int N = x->dims()[0]; + int H = param.output_shape[2]; + int W = param.output_shape[3]; + + vh = reinterpret_cast(malloc(sizeof(float) * H)); + vw = reinterpret_cast(malloc(sizeof(float) * W)); + int out_size = H * W * 3; + float scale = 2 / (static_cast(H) - 1); + for (int i = 0; i < H; i++) { + vh[i] = -1 + scale * i; + } + scale = 2 / (static_cast(W) - 1); + for (int i = 0; i < W; i++) { + vw[i] = -1 + i * scale; + } + return; +} +void AffineGridCompute::Run() { + auto& param = Param(); + auto& ctx = this->ctx_->template As(); + + const lite::Tensor* x = param.X; + int N = x->dims()[0]; + + int H = param.output_shape[2]; + int W = param.output_shape[3]; + int out_size = H * W * 3; + float* hw3 = ctx.workspace_data() + ctx.llc_size() / sizeof(float); + + for (int i = 0; i < out_size; i += 3) { + hw3[i] = 1; + hw3[i + 1] = 1; + hw3[i + 2] = 1; + } + + for (int i = 0; i < H * W; i++) { + hw3[i * 3 + 1] = vh[i / W]; + } + for (int i = 0; i < H * W; i++) { + hw3[i * 3] = vw[i % W]; + } + + const float* din = x->data(); + float* dout = param.Out->mutable_data(); + float* tmp = dout; + operators::ActivationParam act_param; + act_param.has_active = false; + for (int i = 0; i < N; i++) { + lite::arm::math::sgemm(false, + true, + H * W, + 2, + 3, + 1.f, + hw3, + 3, + din, + 3, + 0.f, + dout, + 2, + nullptr, + false, + act_param, + &ctx); + + din += 6; + dout += H * W * 2; + } + + return; +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(affine_grid, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::AffineGridCompute, + def) + .BindInput("Theta", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/affine_grid_compute.h b/lite/kernels/arm/affine_grid_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..09f0e1f85c88acc2f70f0ca12f942c560b61a722 --- /dev/null +++ b/lite/kernels/arm/affine_grid_compute.h @@ -0,0 +1,39 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +class AffineGridCompute : public KernelLite { + public: + using param_t = operators::AffineGridParam; + void PrepareForRun() override; + + void Run() override; + + virtual ~AffineGridCompute() = default; + float* vh; + float* vw; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/arm/argmax_compute_test.cc b/lite/kernels/arm/argmax_compute_test.cc index 034d57cdaba77130b319d203c3ae0616720c9d31..5e511264a855ac86a9fb12ede56d51fb1ea83010 100644 --- a/lite/kernels/arm/argmax_compute_test.cc +++ b/lite/kernels/arm/argmax_compute_test.cc @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/argmax_compute.h" #include + #include #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/argmax_compute.h" namespace paddle { namespace lite { @@ -66,9 +68,7 @@ void argmax_compute_ref(const operators::ArgmaxParam& param) { } TEST(argmax_arm, retrive_op) { - auto argmax = - KernelRegistry::Global().Create( - "arg_max"); + auto argmax = KernelRegistry::Global().Create("arg_max"); ASSERT_FALSE(argmax.empty()); ASSERT_TRUE(argmax.front()); } diff --git a/lite/kernels/arm/axpy_compute_test.cc b/lite/kernels/arm/axpy_compute_test.cc index af145435ebe2c5bd0c1d1b78b112e8a8572d36ec..7348630e776155cd421bc78a9da7494d42e84c3f 100644 --- a/lite/kernels/arm/axpy_compute_test.cc +++ b/lite/kernels/arm/axpy_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/axpy_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/axpy_compute.h" namespace paddle { namespace lite { @@ -61,8 +63,7 @@ void axpy_compute_ref(const operators::AxpyParam& param) { } TEST(axpy_arm, retrive_op) { - auto axpy = - KernelRegistry::Global().Create("axpy"); + auto axpy = KernelRegistry::Global().Create("axpy"); ASSERT_FALSE(axpy.empty()); ASSERT_TRUE(axpy.front()); } diff --git a/lite/kernels/arm/batch_norm_compute_test.cc b/lite/kernels/arm/batch_norm_compute_test.cc index bf690f88a5e776709a3988cc843762db3bf684e6..a3ef9bda4a17ebfdb5468c911cc6c9aa6a5d4fd7 100644 --- a/lite/kernels/arm/batch_norm_compute_test.cc +++ b/lite/kernels/arm/batch_norm_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/arm/batch_norm_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/batch_norm_compute.h" namespace paddle { namespace lite { @@ -78,9 +80,7 @@ void batch_norm_compute_ref(const operators::BatchNormParam& param) { } TEST(batch_norm_arm, retrive_op) { - auto batch_norm = - KernelRegistry::Global().Create( - "batch_norm"); + auto batch_norm = KernelRegistry::Global().Create("batch_norm"); ASSERT_FALSE(batch_norm.empty()); ASSERT_TRUE(batch_norm.front()); } diff --git a/lite/kernels/arm/calib_compute.cc b/lite/kernels/arm/calib_compute.cc index 6dac97dcbc59991d4680ab1a98a54a900573f631..383e868843b43f4081e1eac330b1422b79307d9c 100644 --- a/lite/kernels/arm/calib_compute.cc +++ b/lite/kernels/arm/calib_compute.cc @@ -33,6 +33,17 @@ void CalibComputeFp32ToInt8::Run() { din, dout, scale.data(), 1, 1, param.input->numel()); } +template +void CalibComputeInt64ToInt32::Run() { + auto& param = this->template Param(); + const auto* din = param.input->template data(); + std::vector scale = {param.scale}; + auto* dout = param.output->template mutable_data(); + for (auto i = 0; i < param.input->numel(); ++i) { + dout[i] = din[i]; + } +} + template void CalibComputeInt8ToFp32::Run() { auto& param = this->template Param(); @@ -105,6 +116,23 @@ REGISTER_LITE_KERNEL( DATALAYOUT(kNHWC))}) .Finalize(); +REGISTER_LITE_KERNEL( + calib, + kARM, + kInt64, + kNCHW, + paddle::lite::kernels::arm::CalibComputeInt64ToInt32, + int64_to_int32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt64), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt32), + DATALAYOUT(kNCHW))}) + .Finalize(); + REGISTER_LITE_KERNEL( calib_once, kARM, @@ -161,3 +189,20 @@ REGISTER_LITE_KERNEL( PRECISION(kFloat), DATALAYOUT(kNHWC))}) .Finalize(); + +REGISTER_LITE_KERNEL( + calib_once, + kARM, + kInt64, + kNCHW, + paddle::lite::kernels::arm::CalibComputeInt64ToInt32, + int64_to_int32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt64), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt32), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/arm/calib_compute.h b/lite/kernels/arm/calib_compute.h index a4c8b4c1232101416e95171d70ab629f6a37177b..f10bb931df9b276bc3bb01da16906f3e5b5a7dce 100644 --- a/lite/kernels/arm/calib_compute.h +++ b/lite/kernels/arm/calib_compute.h @@ -34,6 +34,19 @@ class CalibComputeFp32ToInt8 private: }; +template +class CalibComputeInt64ToInt32 + : public KernelLite { + public: + using param_t = operators::CalibParam; + + void Run() override; + + ~CalibComputeInt64ToInt32() override{}; + + private: +}; + template class CalibComputeInt8ToFp32 : public KernelLite { diff --git a/lite/kernels/arm/cast_compute.cc b/lite/kernels/arm/cast_compute.cc index 3b3ef07e105c583b7e3eb8b64b14610ca0f9e41a..919e9c603edff4383f086ac795c3dff4ed856c4f 100644 --- a/lite/kernels/arm/cast_compute.cc +++ b/lite/kernels/arm/cast_compute.cc @@ -62,8 +62,19 @@ void CastCompute::Run() { int32_t* out_data = param.Out->mutable_data(); std::transform( x_data_begin, x_data_end, out_data, TransOp); + } else if (param.in_dtype == 0 && param.out_dtype == 5) { // bool->fp32 + const bool* x_data_begin = param.X->data(); + const bool* x_data_end = x_data_begin + param.X->numel(); + float* out_data = param.Out->mutable_data(); + std::transform(x_data_begin, x_data_end, out_data, TransOp); + } else if 
(param.in_dtype == 3 && param.out_dtype == 5) { // int64->fp32 + const int64_t* x_data_begin = param.X->data(); + const int64_t* x_data_end = x_data_begin + param.X->numel(); + float* out_data = param.Out->mutable_data(); + std::transform(x_data_begin, x_data_end, out_data, TransOp); } else { - LOG(FATAL) << "other has not been implemented"; + LOG(FATAL) << "other has not been implemented transform with dtype" + << param.in_dtype << " X, dtype" << param.out_dtype << " Out"; } } diff --git a/lite/kernels/arm/clip_compute.cc b/lite/kernels/arm/clip_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..2d71eaef9e5b3e68d571a48e1a9772b8870c29b7 --- /dev/null +++ b/lite/kernels/arm/clip_compute.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/arm/clip_compute.h" +#include +#include +#include "lite/backends/arm/math/funcs.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void ClipCompute::Run() { + auto& param = Param(); + lite::Tensor* x = param.x; + lite::Tensor* min_tensor = param.min_tensor; + lite::Tensor* max_tensor = param.max_tensor; + lite::Tensor* out = param.out; + float min = param.min; + float max = param.max; + + if (min_tensor != nullptr) { + min = min_tensor->data()[0]; + } + if (max_tensor != nullptr) { + max = max_tensor->data()[0]; + } + + const float* x_ptr = x->data(); + float* out_ptr = out->mutable_data(); + int64_t num = x->numel(); + lite::arm::math::clip_kernel_fp32(x_ptr, num, min, max, out_ptr); + return; +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + clip, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ClipCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Min", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Max", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/activation_grad_compute.h b/lite/kernels/arm/clip_compute.h similarity index 81% rename from lite/kernels/arm/activation_grad_compute.h rename to lite/kernels/arm/clip_compute.h index ef03f58fa8cd499192aa6edfe3a7c51b49b14f65..94c2b3a32ea2fc0847d8e223ecd61856fa8e3ed2 100644 --- a/lite/kernels/arm/activation_grad_compute.h +++ b/lite/kernels/arm/clip_compute.h @@ -15,20 +15,20 @@ #pragma once #include #include "lite/core/kernel.h" -#include "lite/core/op_registry.h" +#include "lite/operators/clip_op.h" namespace paddle { namespace lite { namespace kernels { namespace arm { -class SquareGradCompute : public KernelLite { +class ClipCompute : public KernelLite { public: - using param_t = operators::ActivationGradParam; + using param_t = operators::ClipParam; void Run() override; - virtual ~SquareGradCompute() = default; + virtual 
~ClipCompute() = default; }; } // namespace arm diff --git a/lite/kernels/arm/concat_compute.cc b/lite/kernels/arm/concat_compute.cc index dc78e1b955c29b261b2103479ea00bb836c0a31f..9ab4ca54bb909876bc823ac25cb67764eab12e47 100644 --- a/lite/kernels/arm/concat_compute.cc +++ b/lite/kernels/arm/concat_compute.cc @@ -52,11 +52,7 @@ void ConcatFunc(const std::vector inputs, output_offset += in_stride[0]; } } else { - std::vector inputs_concat(inputs.size()); - for (int j = 0; j < inputs.size(); ++j) { - inputs_concat[j] = inputs[j]; - } - lite::arm::math::concat_func(inputs_concat, axis, out); + lite::arm::math::concat_func(inputs, axis, out); } } @@ -71,6 +67,9 @@ void ConcatCompute::Run() { auto* axis_tensor_data = axis_tensor->data(); axis = axis_tensor_data[0]; } + if (axis < 0) { + axis += inputs[0]->dims().size(); + } switch (inputs.front()->precision()) { case PRECISION(kFloat): diff --git a/lite/kernels/arm/concat_compute_test.cc b/lite/kernels/arm/concat_compute_test.cc index 44c6dedd44ad4509a3f5a9c13fc04d6f1ffbdc64..862094fd23aa339bba0b06c4200e71f06402c645 100644 --- a/lite/kernels/arm/concat_compute_test.cc +++ b/lite/kernels/arm/concat_compute_test.cc @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/concat_compute.h" #include + #include #include #include + #include "lite/backends/arm/math/funcs.h" #include "lite/core/op_registry.h" #include "lite/core/tensor.h" +#include "lite/kernels/arm/concat_compute.h" namespace paddle { namespace lite { @@ -221,8 +223,7 @@ TEST(concat_arm, compute_input_multi) { } TEST(concat, retrive_op) { - auto concat = - KernelRegistry::Global().Create("concat"); + auto concat = KernelRegistry::Global().Create("concat"); ASSERT_FALSE(concat.empty()); ASSERT_TRUE(concat.front()); } diff --git a/lite/kernels/arm/conv_compute.cc b/lite/kernels/arm/conv_compute.cc index ef174814ced73d4b2ec20580e06c63d39693ce57..54e67de5abbfc88f64a50b07335d2527d9738206 100644 --- a/lite/kernels/arm/conv_compute.cc +++ b/lite/kernels/arm/conv_compute.cc @@ -121,10 +121,14 @@ void ConvCompute::PrepareForRun() { no_dilation && flag_dw) { impl_ = new DepthwiseConv; // VLOG(3) << "Run DepthwiseConv Int8"; + } else if (param.groups == 1 && kw == 3 && sw == 2 && no_dilation && + pads_equal) { + impl_ = new DirectConv; + // VLOG(3) << "Run DirectConv Int8"; } else if (param.groups == 1 && kw == 3 && sw == 1 && no_dilation && pads_equal) { impl_ = new WinogradConv; - // VLOG(3) << "Run DirectConv Int8"; + // VLOG(3) << "Run WinogradConv Int8"; } else { impl_ = new GemmLikeConv; // VLOG(3) << "Run GemmLikeConvInt8"; @@ -168,10 +172,14 @@ void ConvCompute::PrepareForRun() { no_dilation && flag_dw) { impl_ = new DepthwiseConv; // VLOG(3) << "Run DepthwiseConv Int8"; + } else if (param.groups == 1 && kw == 3 && sw == 2 && no_dilation && + pads_equal) { + impl_ = new DirectConv; + // VLOG(3) << "Run DirectConv Int8"; } else if (param.groups == 1 && kw == 3 && sw == 1 && no_dilation && pads_equal) { impl_ = new WinogradConv; - // VLOG(3) << "Run DirectConv Int8"; + // VLOG(3) << "Run WinogradConv Int8"; } else { impl_ = new GemmLikeConv; // VLOG(3) << "Run GemmLikeConvInt8"; diff --git a/lite/kernels/arm/conv_winograd.cc b/lite/kernels/arm/conv_winograd.cc index c6e06a243cc1d1f1c8dc35338d8183352c4f679a..f61c6109cdfd57b30c2b57390d21dec7c3bb3aa2 100644 --- a/lite/kernels/arm/conv_winograd.cc +++ b/lite/kernels/arm/conv_winograd.cc @@ -358,6 +358,9 @@ void WinogradConv::Run() { param, &ctx); } 
+#ifdef LITE_WITH_PROFILE + kernel_func_name_ = "conv_compute_2x2_3x3_int8"; +#endif } template class WinogradConv; template class WinogradConv; diff --git a/lite/kernels/arm/conv_winograd.h b/lite/kernels/arm/conv_winograd.h index 69835a74b40b4f08d78cb11f3b9415eef7bc89d6..b93a719f7dbb13aa9888ea943fa81b6ea2b38c00 100644 --- a/lite/kernels/arm/conv_winograd.h +++ b/lite/kernels/arm/conv_winograd.h @@ -61,6 +61,13 @@ class WinogradConv virtual void PrepareForRun(); virtual void ReInitWhenNeeded(); virtual void Run(); +#ifdef LITE_WITH_PROFILE + virtual void SetProfileRuntimeKernelInfo( + paddle::lite::profile::OpCharacter* ch) { + ch->kernel_func_name = kernel_func_name_; + } + std::string kernel_func_name_{"NotImplForConvWino"}; +#endif protected: using param_t = operators::ConvParam; diff --git a/lite/kernels/arm/decode_bboxes_compute_test.cc b/lite/kernels/arm/decode_bboxes_compute_test.cc index 271a99c29b61063877b7d1c0d2e50bc65d135d72..ef9da0f1e2c53a021c82f19d3151a2fe8fba8af4 100644 --- a/lite/kernels/arm/decode_bboxes_compute_test.cc +++ b/lite/kernels/arm/decode_bboxes_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/decode_bboxes_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/decode_bboxes_compute.h" namespace paddle { namespace lite { @@ -115,9 +117,7 @@ void decode_bboxes_compute_ref(const operators::DecodeBboxesParam& param) { } TEST(decode_bboxes_arm, retrive_op) { - auto decode_bboxes = - KernelRegistry::Global().Create( - "decode_bboxes"); + auto decode_bboxes = KernelRegistry::Global().Create("decode_bboxes"); ASSERT_FALSE(decode_bboxes.empty()); ASSERT_TRUE(decode_bboxes.front()); } diff --git a/lite/kernels/arm/deformable_conv_compute.h b/lite/kernels/arm/deformable_conv_compute.h index 6c8995ddd447a4382ee40e00f3b31832566ad9e9..17fae957619b7754637023a21169da9641686e59 100644 --- a/lite/kernels/arm/deformable_conv_compute.h +++ b/lite/kernels/arm/deformable_conv_compute.h @@ -17,6 +17,7 @@ #include "lite/backends/arm/math/funcs.h" #include "lite/core/kernel.h" #ifdef LITE_WITH_PROFILE +#include #include "lite/core/profile/profiler.h" #endif @@ -56,8 +57,9 @@ class DeformableConvCompute : public KernelLite { #ifdef LITE_WITH_PROFILE virtual void SetProfileRuntimeKernelInfo( paddle::lite::profile::OpCharacter* ch) { - impl_->SetProfileRuntimeKernelInfo(ch); + ch->kernel_func_name = kernel_func_name_; } + std::string kernel_func_name_{"NotImplForDeformableConv"}; #endif ~DeformableConvCompute() = default; diff --git a/lite/kernels/arm/dropout_compute_test.cc b/lite/kernels/arm/dropout_compute_test.cc index 1c0f8db347304076caee23ee3d295bcfacbe2a1f..0aa16b8d348d7b8415120051df0e9732fada4495 100644 --- a/lite/kernels/arm/dropout_compute_test.cc +++ b/lite/kernels/arm/dropout_compute_test.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
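Both the Winograd and deformable-conv kernels gain the same LITE_WITH_PROFILE hook: the kernel records which low-level routine it actually executed and copies that string into the profiler's OpCharacter. A minimal sketch of the pattern, using hypothetical stand-ins for OpCharacter and the kernel class (the real types live in lite/core/profile):

#include <iostream>
#include <string>

// Hypothetical stand-in for lite::profile::OpCharacter.
struct OpCharacter {
  std::string kernel_func_name;
};

class WinogradLikeKernel {
 public:
  void Run() {
    // A real kernel picks an implementation here and records which
    // low-level routine actually ran.
    kernel_func_name_ = "conv_compute_2x2_3x3_int8";
  }
  void SetProfileRuntimeKernelInfo(OpCharacter* ch) {
    ch->kernel_func_name = kernel_func_name_;
  }

 private:
  std::string kernel_func_name_{"NotImplForConvWino"};
};

int main() {
  WinogradLikeKernel k;
  OpCharacter ch;
  k.SetProfileRuntimeKernelInfo(&ch);
  std::cout << ch.kernel_func_name << "\n";  // NotImplForConvWino
  k.Run();
  k.SetProfileRuntimeKernelInfo(&ch);
  std::cout << ch.kernel_func_name << "\n";  // conv_compute_2x2_3x3_int8
}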
-#include "lite/kernels/arm/dropout_compute.h" #include + #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/dropout_compute.h" namespace paddle { namespace lite { @@ -30,9 +32,7 @@ TEST(dropout_arm, init) { } TEST(dropout, retrive_op) { - auto dropout = - KernelRegistry::Global().Create( - "dropout"); + auto dropout = KernelRegistry::Global().Create("dropout"); ASSERT_FALSE(dropout.empty()); ASSERT_TRUE(dropout.front()); } diff --git a/lite/kernels/arm/elementwise_compute.cc b/lite/kernels/arm/elementwise_compute.cc index 8115700f5950ddfcb71df49e6a21528563f23d95..28082785e1c726097a8bfd2165f0d09b9962a5e7 100644 --- a/lite/kernels/arm/elementwise_compute.cc +++ b/lite/kernels/arm/elementwise_compute.cc @@ -300,11 +300,12 @@ void ElementwiseMaxActivationCompute::Run() { } } -void ElementwiseDivCompute::Run() { - auto& param = Param(); - const float* x_data = param.X->data(); - const float* y_data = param.Y->data(); - float* out_data = param.Out->mutable_data(); +template +void ElementwiseDivCompute::Run() { + auto& param = this->template Param(); + auto* x_data = param.X->template data(); + auto* y_data = param.Y->template data(); + auto* out_data = param.Out->template mutable_data(); int axis = param.axis; auto x_dims = param.X->dims(); auto y_dims = param.Y->dims(); @@ -313,10 +314,10 @@ void ElementwiseDivCompute::Run() { LOG(FATAL) << "elewise div don't support x_dims size < y_dims size"; } if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) { - lite::arm::math::elementwise_div_broadcast( + lite::arm::math::elementwise_div_broadcast( x_data, y_data, out_data, pre, n, post); } else { - lite::arm::math::elementwise_div( + lite::arm::math::elementwise_div( x_data, y_data, out_data, x_dims.production()); } } @@ -351,6 +352,29 @@ void ElementwiseDivActivationCompute::Run() { } } +template +void ElementwiseModCompute::Run() { + auto& param = this->template Param(); + auto* x_data = param.X->template data(); + auto* y_data = param.Y->template data(); + auto* out_data = param.Out->template mutable_data(); + int axis = param.axis; + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int pre, n, post; + if (x_dims.size() < y_dims.size() && + is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) { + lite::arm::math::elementwise_mod_broadcast( + y_data, x_data, out_data, pre, n, post); + } else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) { + lite::arm::math::elementwise_mod_broadcast( + x_data, y_data, out_data, pre, n, post); + } else { + lite::arm::math::elementwise_mod( + x_data, y_data, out_data, x_dims.production()); + } +} + } // namespace arm } // namespace kernels } // namespace lite @@ -465,17 +489,27 @@ REGISTER_LITE_KERNEL( .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); -REGISTER_LITE_KERNEL(elementwise_div, - kARM, - kFloat, - kNCHW, - paddle::lite::kernels::arm::ElementwiseDivCompute, - def) +using elementwise_div_fp32 = + paddle::lite::kernels::arm::ElementwiseDivCompute; + +REGISTER_LITE_KERNEL( + elementwise_div, kARM, kFloat, kNCHW, elementwise_div_fp32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); +using elementwise_div_int64 = + paddle::lite::kernels::arm::ElementwiseDivCompute; + +REGISTER_LITE_KERNEL( + elementwise_div, kARM, kInt64, kNCHW, elementwise_div_int64, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + 
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .Finalize(); + REGISTER_LITE_KERNEL( fusion_elementwise_div_activation, kARM, @@ -487,3 +521,13 @@ REGISTER_LITE_KERNEL( .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); + +using elementwise_mod_int64 = + paddle::lite::kernels::arm::ElementwiseModCompute; +REGISTER_LITE_KERNEL( + elementwise_mod, kARM, kInt64, kNCHW, elementwise_mod_int64, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/arm/elementwise_compute.h b/lite/kernels/arm/elementwise_compute.h index 731010a0d189c08f031363e6df95652c000a237b..7d7a93bf6954de9bbcd1b44061e614cd041fafe8 100644 --- a/lite/kernels/arm/elementwise_compute.h +++ b/lite/kernels/arm/elementwise_compute.h @@ -86,8 +86,8 @@ class ElementwiseMaxActivationCompute virtual ~ElementwiseMaxActivationCompute() = default; }; -class ElementwiseDivCompute - : public KernelLite { +template +class ElementwiseDivCompute : public KernelLite { public: void Run() override; @@ -102,6 +102,22 @@ class ElementwiseDivActivationCompute virtual ~ElementwiseDivActivationCompute() = default; }; +template +class ElementwiseModCompute : public KernelLite { + public: + void Run() override; + + virtual ~ElementwiseModCompute() = default; +}; + +// class ElementwiseModActivationCompute +// : public KernelLite { +// public: +// void Run() override; + +// virtual ~ElementwiseModActivationCompute() = default; +// }; + } // namespace arm } // namespace kernels } // namespace lite diff --git a/lite/kernels/arm/elementwise_compute_test.cc b/lite/kernels/arm/elementwise_compute_test.cc index b0ac3a7d33d92239c83147a3fe7615cd2fbf0249..62a5bc423ca6e72098332963713e8baffb366325 100644 --- a/lite/kernels/arm/elementwise_compute_test.cc +++ b/lite/kernels/arm/elementwise_compute_test.cc @@ -12,11 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/arm/elementwise_compute.h" #include + +#include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/elementwise_compute.h" namespace paddle { namespace lite { @@ -24,9 +27,7 @@ namespace kernels { namespace arm { TEST(elementwise_add_arm, retrive_op) { - auto elementwise_add = - KernelRegistry::Global().Create( - "elementwise_add"); + auto elementwise_add = KernelRegistry::Global().Create("elementwise_add"); ASSERT_FALSE(elementwise_add.empty()); ASSERT_TRUE(elementwise_add.front()); } @@ -140,6 +141,119 @@ void elementwise_compute_ref(const operators::ElementwiseParam& param, } } +template +void elementwise_fmod_compute_ref(const operators::ElementwiseParam& param, + const std::string act_type) { + const dtype* x_data = param.X->data(); + const dtype* y_data = param.Y->data(); + dtype* out_data = param.Out->mutable_data(); + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int axis = param.axis; + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + int batch = 1; + int channels = 1; + int num = 1; + for (int i = 0; i < axis; ++i) { + batch *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + channels *= y_dims[i]; + } + for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { + num *= x_dims[i]; + } + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = fmod(diny_data + fmod(*din_ptr, diny_data), diny_data); + dout_ptr++; + din_ptr++; + } + } + } + // do activation relu + if (act_type.size() > 0) { + if (act_type == "relu") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + dtype* dout_ptr = out_data + (i * channels + j) * num; + for (int k = 0; k < num; ++k) { + *dout_ptr = *dout_ptr > 0.0f ? *dout_ptr : 0.0f; + dout_ptr++; + } + } + } + } + } +} + +template +void elementwise_imod_compute_ref(const operators::ElementwiseParam& param, + const std::string act_type) { + const dtype* x_data = param.X->data(); + const dtype* y_data = param.Y->data(); + dtype* out_data = param.Out->mutable_data(); + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + int axis = param.axis; + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + int batch = 1; + int channels = 1; + int num = 1; + for (int i = 0; i < axis; ++i) { + batch *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + channels *= y_dims[i]; + } + for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { + num *= x_dims[i]; + } + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = (*din_ptr) % diny_data; + dout_ptr++; + din_ptr++; + } + } + } + // do activation relu + if (act_type.size() > 0) { + if (act_type == "relu") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + dtype* dout_ptr = out_data + (i * channels + j) * num; + for (int k = 0; k < num; ++k) { + *dout_ptr = *dout_ptr > 0.0f ? 
*dout_ptr : 0.0f; + dout_ptr++; + } + } + } + } + } +} + +template void elementwise_fmod_compute_ref( + const operators::ElementwiseParam& param, const std::string act_type); +template void elementwise_imod_compute_ref( + const operators::ElementwiseParam& param, const std::string act_type); +template void elementwise_imod_compute_ref( + const operators::ElementwiseParam& param, const std::string act_type); + TEST(elementwise_add, compute) { ElementwiseAddCompute elementwise_add; operators::ElementwiseParam param; @@ -222,8 +336,7 @@ TEST(elementwise_add, compute) { TEST(fusion_elementwise_add_activation_arm, retrive_op) { auto fusion_elementwise_add_activation = - KernelRegistry::Global().Create( - "fusion_elementwise_add_activation"); + KernelRegistry::Global().Create("fusion_elementwise_add_activation"); ASSERT_FALSE(fusion_elementwise_add_activation.empty()); ASSERT_TRUE(fusion_elementwise_add_activation.front()); } @@ -321,9 +434,7 @@ TEST(fusion_elementwise_add_activation_arm, compute) { } TEST(elementwise_mul_arm, retrive_op) { - auto elementwise_mul = - KernelRegistry::Global().Create( - "elementwise_mul"); + auto elementwise_mul = KernelRegistry::Global().Create("elementwise_mul"); ASSERT_FALSE(elementwise_mul.empty()); ASSERT_TRUE(elementwise_mul.front()); } @@ -416,8 +527,7 @@ TEST(elementwise_mul, compute) { TEST(fusion_elementwise_mul_activation_arm, retrive_op) { auto fusion_elementwise_mul_activation = - KernelRegistry::Global().Create( - "fusion_elementwise_mul_activation"); + KernelRegistry::Global().Create("fusion_elementwise_mul_activation"); ASSERT_FALSE(fusion_elementwise_mul_activation.empty()); ASSERT_TRUE(fusion_elementwise_mul_activation.front()); } @@ -515,9 +625,7 @@ TEST(fusion_elementwise_mul_activation_arm, compute) { } TEST(elementwise_max_arm, retrive_op) { - auto elementwise_max = - KernelRegistry::Global().Create( - "elementwise_max"); + auto elementwise_max = KernelRegistry::Global().Create("elementwise_max"); ASSERT_FALSE(elementwise_max.empty()); ASSERT_TRUE(elementwise_max.front()); } @@ -610,8 +718,7 @@ TEST(elementwise_max, compute) { TEST(fusion_elementwise_max_activation_arm, retrive_op) { auto fusion_elementwise_max_activation = - KernelRegistry::Global().Create( - "fusion_elementwise_max_activation"); + KernelRegistry::Global().Create("fusion_elementwise_max_activation"); ASSERT_FALSE(fusion_elementwise_max_activation.empty()); ASSERT_TRUE(fusion_elementwise_max_activation.front()); } @@ -685,7 +792,7 @@ TEST(fusion_elementwise_max_activation_arm, compute) { } for (int i = 0; i < y_dim.production(); i++) { float sign = i % 2 == 0 ? 
0.5f : -0.5f; - y_data[i] = i * sign; + y_data[i] = (i + 1) * sign; } param.X = &x; param.Y = &y; @@ -708,6 +815,106 @@ TEST(fusion_elementwise_max_activation_arm, compute) { } } +TEST(elementwise_mod_int64_arm, retrive_op) { + auto elementwise_mod = KernelRegistry::Global().Create("elementwise_mod"); + ASSERT_FALSE(elementwise_mod.empty()); + ASSERT_TRUE(elementwise_mod.front()); +} + +TEST(elementwise_mod_int64_arm, init) { + ElementwiseModCompute elementwise_mod; + ASSERT_EQ(elementwise_mod.precision(), PRECISION(kInt64)); + ASSERT_EQ(elementwise_mod.target(), TARGET(kARM)); +} + +TEST(elementwise_mod_int64_arm, compute) { + ElementwiseModCompute elementwise_mod; + operators::ElementwiseParam param; + lite::Tensor x, y, output, output_ref; + +#if 1 + for (auto n : {1, 3, 4}) { + for (auto c : {1, 3, 4}) { + for (auto h : {1, 3, 4}) { + for (auto w : {1, 3, 4}) { + for (auto axis : {-1, 0, 1, 3}) { + for (auto yd : {std::vector({n}), + std::vector({c}), + std::vector({h}), + std::vector({w}), + std::vector({n, c}), + std::vector({c, h}), + std::vector({c, h, w}), + std::vector({n, c, h, w})}) { +#else + for (auto n : {1, 3, 4, 11}) { + for (auto c : {1, 3, 4, 11}) { + for (auto h : {1, 3, 4, 11}) { + for (auto w : {1, 3, 4, 11}) { + for (auto axis : {-1, 0, 1, 2, 3}) { + for (auto yd : {std::vector({n}), + std::vector({c}), + std::vector({h}), + std::vector({w}), + std::vector({n, c}), + std::vector({c, h}), + std::vector({h, w}), + std::vector({n, c, h}), + std::vector({c, h, w}), + std::vector({n, c, h, w})}) { +#endif + auto x_dim = DDim(std::vector({n, c, h, w})); + auto y_dim = DDim(yd); + int axis_t = axis < 0 ? x_dim.size() - y_dim.size() : axis; + + if (axis_t + y_dim.size() > 4) continue; + bool flag = false; + for (int i = 0; i < y_dim.size(); i++) { + if (x_dim[i + axis_t] != y_dim[i]) flag = true; + } + if (flag) continue; + + x.Resize(x_dim); + y.Resize(y_dim); + output.Resize(x_dim); + output_ref.Resize(x_dim); + auto* x_data = x.mutable_data(); + auto* y_data = y.mutable_data(); + auto* output_data = output.mutable_data(); + auto* output_ref_data = output_ref.mutable_data(); + for (int i = 0; i < x_dim.production(); i++) { + x_data[i] = i + 1; + } + for (int i = 0; i < y_dim.production(); i++) { + y_data[i] = y_dim.production() - i; + } + param.X = &x; + param.Y = &y; + param.axis = axis; + param.Out = &output; + elementwise_mod.SetParam(param); + elementwise_mod.Run(); + param.Out = &output_ref; + elementwise_imod_compute_ref(param, ""); + for (int i = 0; i < output.dims().production(); i++) { + if (std::abs(output_data[i] - output_ref_data[i]) > 1e-5 || + std::isnan(output_data[i]) || + std::isnan(output_ref_data[i])) { + LOG(FATAL) << "elementwise mod cmp error, i: " << i + << ", x_data: " << x_data[i] + << ", y_data: " << y_data[i] + << ", output_data: " << output_data[i] + << ", output_ref_data: " << output_ref_data[i]; + } + } + } + } + } + } + } + } +} + } // namespace arm } // namespace kernels } // namespace lite @@ -719,3 +926,4 @@ USE_LITE_KERNEL(elementwise_mul, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(fusion_elementwise_mul_activation, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(elementwise_max, kARM, kFloat, kNCHW, def); USE_LITE_KERNEL(fusion_elementwise_max_activation, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(elementwise_mod, kARM, kInt64, kNCHW, def); diff --git a/lite/kernels/arm/gather_compute.cc b/lite/kernels/arm/gather_compute.cc index 3efacc4aacefcb150d53738c950ec9e797ed78c7..2a9c70aede7475b36f70c628ff6ccaa823f030b2 100644 --- 
a/lite/kernels/arm/gather_compute.cc +++ b/lite/kernels/arm/gather_compute.cc @@ -73,7 +73,6 @@ void GatherCompute::Run() { REGISTER_LITE_KERNEL( gather, kARM, kAny, kNCHW, paddle::lite::kernels::arm::GatherCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) - .BindInput("Index", - {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Index", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))}) .Finalize(); diff --git a/lite/kernels/arm/layer_norm_compute_test.cc b/lite/kernels/arm/layer_norm_compute_test.cc index 22fe3d06569fac424ab797712142b4d088dc7d3a..e84f9f133ce0cdecb714dc535c0f5833597105c6 100644 --- a/lite/kernels/arm/layer_norm_compute_test.cc +++ b/lite/kernels/arm/layer_norm_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/layer_norm_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/layer_norm_compute.h" namespace paddle { namespace lite { @@ -181,9 +183,7 @@ TEST(layer_norm_arm, compute) { } TEST(layer_norm, retrive_op) { - auto layer_norm = - KernelRegistry::Global().Create( - "layer_norm"); + auto layer_norm = KernelRegistry::Global().Create("layer_norm"); ASSERT_FALSE(layer_norm.empty()); ASSERT_TRUE(layer_norm.front()); } diff --git a/lite/kernels/arm/lrn_compute_test.cc b/lite/kernels/arm/lrn_compute_test.cc index e7030d00427e55c7faf333997cd90cba46260cd4..9afd05b80aaffdc4be2ae1deaa5993b8fd21dce4 100644 --- a/lite/kernels/arm/lrn_compute_test.cc +++ b/lite/kernels/arm/lrn_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/lrn_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/lrn_compute.h" namespace paddle { namespace lite { @@ -133,8 +135,7 @@ void lrn_compute_ref(const operators::LrnParam& param) { } TEST(lrn_arm, retrive_op) { - auto lrn = - KernelRegistry::Global().Create("lrn"); + auto lrn = KernelRegistry::Global().Create("lrn"); ASSERT_FALSE(lrn.empty()); ASSERT_TRUE(lrn.front()); } diff --git a/lite/kernels/arm/merge_lod_tensor_compute_test.cc b/lite/kernels/arm/merge_lod_tensor_compute_test.cc index 914a58308bdf0d5c6d374d5f81ca38224941c85d..f8d92dfdc740988733ad26d5385b17050b490635 100644 --- a/lite/kernels/arm/merge_lod_tensor_compute_test.cc +++ b/lite/kernels/arm/merge_lod_tensor_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
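Relaxing the gather Index input from kInt32 to kAny lets the kernel accept either int32 or int64 index tensors, so the implementation has to branch on the runtime index type. A minimal sketch of a dtype-agnostic row gather (hypothetical helper, not the registered kernel):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Gather rows of `src` (row_len values each) selected by `index`.
template <typename IndexT>
void GatherRows(const float* src, const IndexT* index, int index_num,
                int row_len, float* out) {
  for (int i = 0; i < index_num; ++i) {
    const float* row = src + static_cast<int64_t>(index[i]) * row_len;
    std::copy(row, row + row_len, out + i * row_len);
  }
}

int main() {
  std::vector<float> src = {0, 0, 1, 1, 2, 2, 3, 3};  // 4 rows x 2 cols
  std::vector<int64_t> idx64 = {3, 0};
  std::vector<int32_t> idx32 = {1, 2};
  std::vector<float> out(4);

  GatherRows(src.data(), idx64.data(), 2, 2, out.data());
  std::cout << out[0] << " " << out[2] << "\n";  // 3 0

  GatherRows(src.data(), idx32.data(), 2, 2, out.data());
  std::cout << out[0] << " " << out[2] << "\n";  // 1 2
}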
-#include "lite/kernels/arm/merge_lod_tensor_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/merge_lod_tensor_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace arm { TEST(merge_lod_tensor_arm, retrive_op) { - auto kernel = - KernelRegistry::Global().Create( - "merge_lod_tensor"); + auto kernel = KernelRegistry::Global().Create("merge_lod_tensor"); ASSERT_FALSE(kernel.empty()); ASSERT_TRUE(kernel.front()); } diff --git a/lite/kernels/arm/mul_compute_test.cc b/lite/kernels/arm/mul_compute_test.cc index cddee81fe22897dbe91721ed172b144539e0852c..76ab95b93485b3e6701dca6224ce2a5f7a8b3df7 100644 --- a/lite/kernels/arm/mul_compute_test.cc +++ b/lite/kernels/arm/mul_compute_test.cc @@ -12,16 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/mul_compute.h" #include + #include #include #include #include #include #include + #include "lite/backends/arm/math/funcs.h" #include "lite/core/op_registry.h" +#include "lite/kernels/arm/mul_compute.h" namespace paddle { namespace lite { @@ -69,8 +71,7 @@ void FillData(T* a, } TEST(mul_arm, retrive_op) { - auto mul = - KernelRegistry::Global().Create("mul"); + auto mul = KernelRegistry::Global().Create("mul"); ASSERT_FALSE(mul.empty()); ASSERT_TRUE(mul.front()); } diff --git a/lite/kernels/arm/pool_compute_test.cc b/lite/kernels/arm/pool_compute_test.cc index acdaf0d0131621c1c2403b8a071d6cb1134f4565..c4aeb20a5bf53d80be4b407698a51ead46f6b8f5 100644 --- a/lite/kernels/arm/pool_compute_test.cc +++ b/lite/kernels/arm/pool_compute_test.cc @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/pool_compute.h" #include + #include #include #include #include + #include "lite/backends/arm/math/funcs.h" #include "lite/core/op_registry.h" +#include "lite/kernels/arm/pool_compute.h" namespace paddle { namespace lite { @@ -341,8 +343,7 @@ TEST(pool_arm, compute) { } TEST(pool_arm, retrive_op) { - auto pool = KernelRegistry::Global().Create( - "pool2d"); + auto pool = KernelRegistry::Global().Create("pool2d"); ASSERT_FALSE(pool.empty()); ASSERT_TRUE(pool.front()); } diff --git a/lite/kernels/arm/scale_compute_test.cc b/lite/kernels/arm/scale_compute_test.cc index 0d327b9807d306770850b09ed1ed2a0337104c92..fe5e1911d0cc2c012876731f50bd04b3125b8fa2 100644 --- a/lite/kernels/arm/scale_compute_test.cc +++ b/lite/kernels/arm/scale_compute_test.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/arm/scale_compute.h" #include + #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/scale_compute.h" namespace paddle { namespace lite { @@ -103,8 +105,7 @@ TEST(scale_arm, compute) { } TEST(scale, retrive_op) { - auto scale = - KernelRegistry::Global().Create("scale"); + auto scale = KernelRegistry::Global().Create("scale"); ASSERT_FALSE(scale.empty()); ASSERT_TRUE(scale.front()); } diff --git a/lite/kernels/arm/sequence_conv_compute.cc b/lite/kernels/arm/sequence_conv_compute.cc index a70b6717097ec0ffdaa24ba257bfdf8dbd536f3f..69740a258be165f9ceec6829a81497e842b5a697 100644 --- a/lite/kernels/arm/sequence_conv_compute.cc +++ b/lite/kernels/arm/sequence_conv_compute.cc @@ -88,7 +88,7 @@ void SequenceConvCompute::Run() { paddle::lite::arm::math::im2col( sub_in_data, 1, - sequence_len, + input_row_end - input_row_begin, hidden_dim, // C H W -> 1, seq_len, hidden_dim kernel_size, hidden_dim, // kernel_h, kernel_w diff --git a/lite/kernels/arm/softmax_compute.cc b/lite/kernels/arm/softmax_compute.cc index 3409d0f5c5bd6e7ce1ea77809f7715b62bb10ca2..79ea23ab3fad3340c63846ea11cc89b371f5c6c9 100644 --- a/lite/kernels/arm/softmax_compute.cc +++ b/lite/kernels/arm/softmax_compute.cc @@ -34,7 +34,7 @@ void SoftmaxCompute::Run() { int inner_num = x_dims.Slice(axis + 1, x_rank).production(); int axis_size = x_dims[axis]; if (inner_num == 1) { - if (axis_size >= 4) { + if (axis_size > 4) { lite::arm::math::softmax_inner1_large_axis( din, dout, outer_num, axis_size); } else { diff --git a/lite/kernels/arm/softmax_compute_test.cc b/lite/kernels/arm/softmax_compute_test.cc index 459112d8c0169375584baf0cb983037682e47a3d..486ccf2cedd1af3ce0d7cc2f7d0aeecaadf15ca9 100644 --- a/lite/kernels/arm/softmax_compute_test.cc +++ b/lite/kernels/arm/softmax_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/softmax_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/softmax_compute.h" namespace paddle { namespace lite { @@ -121,9 +123,7 @@ TEST(softmax_arm, compute) { } TEST(softmax, retrive_op) { - auto softmax = - KernelRegistry::Global().Create( - "softmax"); + auto softmax = KernelRegistry::Global().Create("softmax"); ASSERT_FALSE(softmax.empty()); ASSERT_TRUE(softmax.front()); } diff --git a/lite/kernels/arm/split_compute_test.cc b/lite/kernels/arm/split_compute_test.cc index 034fbb85c487df6159a6a22b9958cc9e64d9e1c6..c51ea186b52a77abec5c7560b0a028079bea4aba 100644 --- a/lite/kernels/arm/split_compute_test.cc +++ b/lite/kernels/arm/split_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/arm/split_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/split_compute.h" namespace paddle { namespace lite { @@ -165,8 +167,7 @@ TEST(split_arm, compute) { } TEST(split, retrive_op) { - auto split = - KernelRegistry::Global().Create("split"); + auto split = KernelRegistry::Global().Create("split"); ASSERT_FALSE(split.empty()); ASSERT_TRUE(split.front()); } diff --git a/lite/kernels/arm/split_lod_tensor_compute_test.cc b/lite/kernels/arm/split_lod_tensor_compute_test.cc index 3b2004c786698b70b4c54b68d696a9cf5f5221fd..03f5a21890ffd515e83de7895c2be886b15b8967 100644 --- a/lite/kernels/arm/split_lod_tensor_compute_test.cc +++ b/lite/kernels/arm/split_lod_tensor_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/split_lod_tensor_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/arm/split_lod_tensor_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace arm { TEST(split_lod_tensor_arm, retrive_op) { - auto kernel = - KernelRegistry::Global().Create( - "split_lod_tensor"); + auto kernel = KernelRegistry::Global().Create("split_lod_tensor"); ASSERT_FALSE(kernel.empty()); ASSERT_TRUE(kernel.front()); } diff --git a/lite/kernels/arm/transpose_compute_test.cc b/lite/kernels/arm/transpose_compute_test.cc index aaf3f138a54db2c7ff766325cfd61bc51ec8b1d2..74fd14754637427277a6b19b820bb5d3de66c418 100644 --- a/lite/kernels/arm/transpose_compute_test.cc +++ b/lite/kernels/arm/transpose_compute_test.cc @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/arm/transpose_compute.h" #include + #include #include #include + #include "lite/backends/arm/math/funcs.h" #include "lite/core/op_registry.h" #include "lite/core/tensor.h" +#include "lite/kernels/arm/transpose_compute.h" namespace paddle { namespace lite { @@ -121,9 +123,7 @@ TEST(transpose_arm, compute_shape_nchw) { } TEST(transpose, retrive_op) { - auto transpose = - KernelRegistry::Global().Create( - "transpose"); + auto transpose = KernelRegistry::Global().Create("transpose"); ASSERT_FALSE(transpose.empty()); ASSERT_TRUE(transpose.front()); } @@ -189,9 +189,7 @@ TEST(transpose2_arm, compute_shape_nchw) { } TEST(transpose2, retrive_op) { - auto transpose2 = - KernelRegistry::Global().Create( - "transpose2"); + auto transpose2 = KernelRegistry::Global().Create("transpose2"); ASSERT_FALSE(transpose2.empty()); ASSERT_TRUE(transpose2.front()); } diff --git a/lite/kernels/bm/bridges/batch_norm_op.cc b/lite/kernels/bm/bridges/batch_norm_op.cc index fbf70178fdd971edce34b3253b02febfa3e3b85c..f5ecc0825a17f26b1cf65605ea2e8c0c93338f39 100644 --- a/lite/kernels/bm/bridges/batch_norm_op.cc +++ b/lite/kernels/bm/bridges/batch_norm_op.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include +#include #include "lite/kernels/bm/bridges/graph.h" #include "lite/kernels/bm/bridges/utility.h" #include "lite/kernels/npu/bridges/registry.h" @@ -64,10 +65,16 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto* bias_data = bias->mutable_data(); auto* mean_data = mean->mutable_data(); auto* variance_data = variance->mutable_data(); + + float* new_bias = static_cast(malloc(bias->memory_size())); + float* new_scale = static_cast(malloc(scale->memory_size())); + CHECK(new_bias != nullptr); + CHECK(new_scale != nullptr); + for (int c = 0; c < channel_size; c++) { float inv_scale = 1.f / (std::sqrt(variance_data[c] + epsilon)); - bias_data[c] = bias_data[c] - inv_scale * scale_data[c] * mean_data[c]; - scale_data[c] = inv_scale * scale_data[c]; + new_bias[c] = bias_data[c] - inv_scale * scale_data[c] * mean_data[c]; + new_scale[c] = inv_scale * scale_data[c]; } const int input_num = 1; @@ -86,11 +93,13 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { output_dims.size(), static_cast(output_var_name.c_str()), static_cast(unique_op_name.c_str()), - static_cast(scale->mutable_data()), - static_cast(bias->mutable_data()), + static_cast(new_scale), + static_cast(new_bias), 1, 1, 1); + free(new_scale); + free(new_bias); delete[] shape; delete[] name; delete[] dim; diff --git a/lite/kernels/bm/bridges/density_prior_box_op.cc b/lite/kernels/bm/bridges/density_prior_box_op.cc index 137c5142d5ae544226dbe5d6cd7c872fc272b71a..895901d94e2b2077f530e196ef8f30d4f57df793 100644 --- a/lite/kernels/bm/bridges/density_prior_box_op.cc +++ b/lite/kernels/bm/bridges/density_prior_box_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include "lite/kernels/bm/bridges/graph.h" #include "lite/kernels/bm/bridges/utility.h" #include "lite/kernels/npu/bridges/registry.h" diff --git a/lite/kernels/bm/bridges/interpolate_op.cc b/lite/kernels/bm/bridges/interpolate_op.cc index 8c2d39b16ac0206d83199fdeac6c30a0a352856e..a77ec4e8f788e581d9d226369210a449ec50840c 100644 --- a/lite/kernels/bm/bridges/interpolate_op.cc +++ b/lite/kernels/bm/bridges/interpolate_op.cc @@ -76,6 +76,8 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { static_cast(output_var_name.c_str()), 0, 0, + 0, + 0, type); } graph->AddNode(output_var_name); diff --git a/lite/kernels/bm/subgraph_compute.cc b/lite/kernels/bm/subgraph_compute.cc index d7640e1ac7326d9764380469dc97a7806b044437..664198cf9fb45664fdc088df382b9b94a1924e9b 100644 --- a/lite/kernels/bm/subgraph_compute.cc +++ b/lite/kernels/bm/subgraph_compute.cc @@ -28,12 +28,35 @@ namespace lite { namespace kernels { namespace bm { -int SubgraphEngine::BuildDeviceProgram() { +bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() { + // Obtain the origin input tensors, and create the origin output + // tensors(Don't try to access them before launch the device program or the + // origin program) + PrepareWorkspaceForOriginProgram(); + // Create the device input and output tensors, but don't initialize them + // with the dimensions + device_inputs_.resize(input_names_.size()); + for (int i = 0; i < input_names_.size(); i++) { + device_inputs_[i].reset(new hiai::AiTensor); + CHECK(device_inputs_[i]); + } + device_outputs_.resize(output_names_.size()); + for (int i = 0; i < output_names_.size(); i++) { + device_outputs_[i].reset(new hiai::AiTensor); + CHECK(device_outputs_[i]); + } + return true; +} + +bool SubgraphEngine::BuildDeviceProgram() { int status = 0; subgraph::bm::Graph graph; const auto& 
bridges = subgraph::Registry::Instance(); graph.CreateCompilerHandle(); auto& ctx = this->ctx_->template As(); + if (origin_program_.empty()) { + BuildOriginProgram(); + } for (auto& inst : origin_program_) { auto op = const_cast(inst.op()); CHECK(op); @@ -42,7 +65,7 @@ int SubgraphEngine::BuildDeviceProgram() { std::string op_type = op->op_info()->Type(); LOG(INFO) << op_type; if (!bridges.Exists(op_type, TARGET(kBM))) { - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); status |= @@ -50,12 +73,13 @@ int SubgraphEngine::BuildDeviceProgram() { const_cast(op), const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } - std::string net_name = "bmnetc_f32umodel"; + std::string net_name = "bmnet_f32bmodel"; + auto unique_net_name = lite::subgraph::bm::UniqueName(net_name); __bmcompile_opt( - graph.GetCompilerHandle(), const_cast(net_name.c_str()), 1); + graph.GetCompilerHandle(), const_cast(unique_net_name.c_str()), 2); void* bmodel_data = nullptr; unsigned int data_size = 0; bm_hd_ = static_cast(ctx.GetHandle()); @@ -63,7 +87,7 @@ int SubgraphEngine::BuildDeviceProgram() { graph.UnlockCompilerMutex(); bmrt_hd_ = bmrt_create(bm_hd_); if (false == bmrt_load_bmodel_data(bmrt_hd_, bmodel_data, data_size)) { - return subgraph::FAILED; + return false; } bmrt_get_network_names(bmrt_hd_, &net_names_); net_info_ = bmrt_get_network_info(bmrt_hd_, net_names_[0]); @@ -116,10 +140,10 @@ int SubgraphEngine::BuildDeviceProgram() { net_info_->output_dtypes[i], stage.output_shapes[i]); } - return status; + return true; } -int SubgraphEngine::LaunchDeviceProgram() { +bool SubgraphEngine::LaunchDeviceProgram() { for (size_t i = 0; i < device_inputs_.size(); i++) { bm_memcpy_s2d(bm_hd_, device_inputs_[i].device_mem, @@ -143,7 +167,7 @@ int SubgraphEngine::LaunchDeviceProgram() { out_index++; } } - return 0; + return true; } void SubgraphCompute::PrepareForRun() { @@ -155,12 +179,11 @@ void SubgraphCompute::PrepareForRun() { param.output_data_names, param.scope)); CHECK(engine_); - engine_->Build(); } void SubgraphCompute::Run() { CHECK(engine_); - engine_->Launch(); + engine_->Run(); } } // namespace bm diff --git a/lite/kernels/bm/subgraph_compute.h b/lite/kernels/bm/subgraph_compute.h index 60f7661c7990d90020dbfc7ec3a6e0d178dceb70..7a5b2552ff95681da09346ba11f40f1a6acb7f01 100644 --- a/lite/kernels/bm/subgraph_compute.h +++ b/lite/kernels/bm/subgraph_compute.h @@ -44,8 +44,9 @@ class SubgraphEngine : public subgraph::Engine { ctx, block_idx, block_desc, input_names, output_names, scope) {} protected: - int BuildDeviceProgram() override; - int LaunchDeviceProgram() override; + bool PrepareWorkspaceForDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; private: void *bmrt_hd_; diff --git a/lite/kernels/cuda/CMakeLists.txt b/lite/kernels/cuda/CMakeLists.txt index 1a58a51c36a1ccbb21bb2830a197c096e7ddac51..22bb4345fe744df9a06997d366310e2cc24a7a12 100644 --- a/lite/kernels/cuda/CMakeLists.txt +++ b/lite/kernels/cuda/CMakeLists.txt @@ -6,6 +6,8 @@ message(STATUS "compile with lite CUDA kernels") # basic kernels add_kernel(mul_compute_cuda CUDA basic SRCS mul_compute.cc DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(fc_compute_cuda CUDA basic SRCS fc_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(matmul_compute_cuda CUDA basic SRCS matmul_compute.cc DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(search_group_padding_compute_cuda CUDA basic SRCS search_group_padding_compute.cu 
DEPS ${lite_kernel_deps}) add_kernel(io_copy_compute_cuda CUDA basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps}) add_kernel(leaky_relu_compute_cuda CUDA basic SRCS leaky_relu_compute.cu DEPS ${lite_kernel_deps}) @@ -34,7 +36,10 @@ add_kernel(bilinear_interp_compute_cuda CUDA basic SRCS bilinear_interp_compute. add_kernel(search_seq_depadding_compute_cuda CUDA extra SRCS search_seq_depadding_compute.cu DEPS ${lite_kernel_deps}) add_kernel(search_grnn_compute_cuda CUDA extra SRCS search_grnn_compute.cu DEPS ${lite_kernel_deps} cuda_gemm ${math_cuda}) add_kernel(sequence_reverse_compute_cuda CUDA extra SRCS sequence_reverse_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sequence_pad_compute_cuda CUDA extra SRCS sequence_pad_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(sequence_unpad_compute_cuda CUDA extra SRCS sequence_unpad_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(sequence_concat_compute_cuda CUDA extra SRCS sequence_concat_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sequence_mask_compute_cuda CUDA extra SRCS sequence_mask_compute.cu DEPS ${lite_kernel_deps}) add_kernel(sequence_arithmetic_compute_cuda CUDA extra SRCS sequence_arithmetic_compute.cu DEPS ${lite_kernel_deps}) add_kernel(lookup_table_compute_cuda CUDA extra SRCS lookup_table_compute.cu DEPS ${lite_kernel_deps}) add_kernel(attention_padding_mask_compute_cuda CUDA extra SRCS attention_padding_mask_compute.cu DEPS ${lite_kernel_deps}) @@ -44,6 +49,8 @@ add_kernel(match_matrix_tensor_compute_cuda CUDA extra SRCS match_matrix_tensor_ add_kernel(search_aligned_mat_mul_compute_cuda CUDA extra SRCS search_aligned_mat_mul_compute.cc DEPS ${lite_kernel_deps} cuda_batched_gemm) add_kernel(search_seq_fc_compute_cuda CUDA extra SRCS search_seq_fc_compute.cu DEPS ${lite_kernel_deps} cuda_gemm) add_kernel(var_conv_2d_compute_cuda CUDA extra SRCS var_conv_2d_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(topk_pooling_compute_cuda CUDA extra SRCS topk_pooling_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(assign_value_compute_cuda CUDA extra SRCS assign_value_compute.cu DEPS ${lite_kernel_deps}) # unit test lite_cc_test(calib_compute_cuda_test SRCS calib_compute_cuda_test.cc DEPS calib_compute_cuda) @@ -60,7 +67,9 @@ nv_test(concat_compute_cuda_test SRCS concat_compute_test.cc DEPS concat_compute nv_test(elementwise_compute_cuda_test SRCS elementwise_compute_test.cc DEPS elementwise_compute_cuda) nv_test(softmax_compute_cuda_test SRCS softmax_compute_test.cc DEPS softmax_compute_cuda) #nv_test(layout_cuda_test SRCS layout_compute_test.cc DEPS layout_compute_cuda) -nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda) +nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda) +nv_test(fc_compute_cuda_test SRCS fc_compute_test.cc DEPS fc_compute_cuda) +nv_test(matmul_compute_cuda_test SRCS matmul_compute_test.cc DEPS matmul_compute_cuda) nv_test(dropout_compute_cuda_test SRCS dropout_compute_test.cc DEPS dropout_compute_cuda ) nv_test(bilinear_interp_compute_cuda_test SRCS bilinear_interp_compute_test.cc DEPS bilinear_interp_compute_cuda) #nv_test(pool_compute_cuda_test SRCS pool_compute_test.cc DEPS pool_compute_cuda) @@ -74,9 +83,14 @@ if(LITE_BUILD_EXTRA) nv_test(search_aligned_mat_mul_compute_cuda_test SRCS search_aligned_mat_mul_compute_test.cc DEPS search_aligned_mat_mul_compute_cuda) nv_test(search_seq_fc_compute_cuda_test SRCS search_seq_fc_compute_test.cc DEPS search_seq_fc_compute_cuda) 
nv_test(sequence_reverse_compute_cuda_test SRCS sequence_reverse_compute_test.cc DEPS sequence_reverse_compute_cuda) + nv_test(sequence_pad_compute_cuda_test SRCS sequence_pad_compute_test.cc DEPS sequence_pad_compute_cuda) + nv_test(sequence_unpad_compute_cuda_test SRCS sequence_unpad_compute_test.cc DEPS sequence_unpad_compute_cuda) + nv_test(sequence_mask_compute_cuda_test SRCS sequence_mask_compute_test.cc DEPS sequence_mask_compute_cuda) nv_test(var_conv_2d_compute_cuda_test SRCS var_conv_2d_compute_test.cc DEPS var_conv_2d_compute_cuda) #nv_test(sequence_concat_compute_cuda_test SRCS sequence_concat_compute_test.cc DEPS sequence_concat_compute_cuda) #nv_test(attention_padding_mask_compute_cuda_test SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_cuda) nv_test(sequence_arithmetic_compute_cuda_test SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_cuda) #nv_test(search_fc_cuda_test SRCS search_fc_compute_test.cc DEPS search_fc_compute_cuda) + nv_test(topk_pooling_compute_cuda_test SRCS topk_pooling_compute_test.cc DEPS topk_pooling_compute_cuda) + nv_test(assign_value_compute_cuda_test SRCS assign_value_compute_test.cc DEPS assign_value_compute_cuda) endif() diff --git a/lite/kernels/cuda/assign_value_compute.cu b/lite/kernels/cuda/assign_value_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..89f2937f10399361951c3c8deb47e3700f93e288 --- /dev/null +++ b/lite/kernels/cuda/assign_value_compute.cu @@ -0,0 +1,76 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
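The new assign_value kernel materializes one of the op's host-side attribute vectors (fp32/int32/int64/bool, selected by the dtype attribute) into the output tensor; on CUDA this becomes an async host-to-device copy. A plain C++ sketch of the dtype dispatch, with std::memcpy standing in for the device copy and a local stand-in for lite::core::FluidType:

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Stand-in for the fluid proto dtype codes used by assign_value.
enum class FluidType : int { BOOL = 0, INT32 = 2, INT64 = 3, FP32 = 5 };

template <typename T>
void CopyToOutput(const std::vector<T>& src, void* dst) {
  std::memcpy(dst, src.data(), src.size() * sizeof(T));
}

int main() {
  int dtype = static_cast<int>(FluidType::FP32);
  std::vector<float> fp32_values = {5.f, 6.f, 7.f};
  std::vector<int32_t> int32_values = {0, 1, 2};

  std::vector<unsigned char> out(fp32_values.size() * sizeof(float));
  if (dtype == static_cast<int>(FluidType::FP32)) {
    CopyToOutput(fp32_values, out.data());
  } else if (dtype == static_cast<int>(FluidType::INT32)) {
    CopyToOutput(int32_values, out.data());
  } else {
    std::cerr << "Unsupported dtype for assign_value: " << dtype << "\n";
    return 1;
  }
  std::cout << reinterpret_cast<float*>(out.data())[0] << "\n";  // 5
}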
+ +#include +#include + +#include "lite/backends/cuda/target_wrapper.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +#include "lite/kernels/cuda/assign_value_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void TensorFromVector(const std::vector& src, + lite::Tensor* dst, + cudaStream_t* stream) { + auto* src_ptr = static_cast(src.data()); + auto* dst_ptr = static_cast(dst->mutable_data(TARGET(kCUDA))); + auto size = src.size() * sizeof(T); + TargetWrapperCuda::MemcpyAsync( + dst_ptr, src_ptr, size, IoDirection::HtoD, *stream); +} + +void AssignValueCompute::Run() { + auto& param = Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + int dtype = param.dtype; + std::vector fp32_values = param.fp32_values; + std::vector int32_values = param.int32_values; + std::vector int64_values = param.int64_values; + std::vector bool_values = param.bool_values; + auto* out = param.Out; + + if (dtype == static_cast(lite::core::FluidType::INT32)) { + TensorFromVector(int32_values, out, &stream); + } else if (dtype == static_cast(lite::core::FluidType::FP32)) { + TensorFromVector(fp32_values, out, &stream); + } else if (dtype == static_cast(lite::core::FluidType::INT64)) { + TensorFromVector(int64_values, out, &stream); + } else if (dtype == static_cast(lite::core::FluidType::BOOL)) { + TensorFromVector(bool_values, out, &stream); + } else { + LOG(FATAL) << "Unsupported dtype for assign_value_op:" << dtype; + } + return; +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(assign_value, + kCUDA, + kAny, + kNCHW, + paddle::lite::kernels::cuda::AssignValueCompute, + def) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kAny))}) + .Finalize(); diff --git a/lite/kernels/cuda/assign_value_compute.h b/lite/kernels/cuda/assign_value_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..c334e36d8061437881a4ea67d960f87b7ffb3ceb --- /dev/null +++ b/lite/kernels/cuda/assign_value_compute.h @@ -0,0 +1,34 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class AssignValueCompute : public KernelLite { + public: + using param_t = operators::AssignValueParam; + + void Run() override; + virtual ~AssignValueCompute() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/assign_value_compute_test.cc b/lite/kernels/cuda/assign_value_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c29426b745e92f71bcfeca6a8fc2890cd1908b4 --- /dev/null +++ b/lite/kernels/cuda/assign_value_compute_test.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/assign_value_compute.h" + +#include + +#include +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class AssignValueTest : public ::testing::Test { + protected: + AssignValueTest() : dtype_(5), shape_({1}) { + int num = std::accumulate( + shape_.begin(), shape_.end(), 1, std::multiplies()); + fp32_values_.resize(num); + int32_values_.resize(num); + int64_values_.resize(num); + bool_values_.resize(num); + for (int i = 0; i < num; ++i) { + fp32_values_[i] = i + 5; + int32_values_[i] = i; + int64_values_[i] = i; + bool_values_[i] = i; + } + std::vector out_shape(shape_.size(), 0); + for (size_t i = 0; i < shape_.size(); ++i) out_shape[i] = shape_[i]; + out_ref_.Resize(lite::DDim(out_shape)); + out_gpu_.Resize(out_ref_.dims()); + out_cpu_.Resize(out_ref_.dims()); + + RunBaseLine(&out_ref_); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.shape = shape_; + param_.dtype = dtype_; + param_.fp32_values = fp32_values_; + param_.int32_values = int32_values_; + param_.int64_values = int64_values_; + param_.bool_values = bool_values_; + param_.Out = &out_gpu_; + } + + void InitFloatInput() {} + + void InitHalfInput() {} + + void RunBaseLine(lite::Tensor* out) { + if (dtype_ == static_cast(lite::core::FluidType::INT32)) { + for (size_t i = 0; i < int32_values_.size(); ++i) { + out->mutable_data()[i] = int32_values_[i]; + } + } else if (dtype_ == static_cast(lite::core::FluidType::FP32)) { + for (size_t i = 0; i < fp32_values_.size(); ++i) { + out->mutable_data()[i] = fp32_values_[i]; + } + } else if (dtype_ == static_cast(lite::core::FluidType::INT64)) { + for (size_t i = 0; i < int64_values_.size(); ++i) { + out->mutable_data()[i] = int64_values_[i]; + } + } else if (dtype_ == static_cast(lite::core::FluidType::BOOL)) { + for (size_t i = 0; i < bool_values_.size(); ++i) { + out->mutable_data()[i] = bool_values_[i]; + } + } else { + LOG(FATAL) << "Unsupported dtype_ for assign_value_op:" << dtype_; + } + } + + int dtype_; + std::vector shape_; + std::vector fp32_values_; + std::vector int32_values_; + std::vector int64_values_; + std::vector bool_values_; + + lite::Tensor out_ref_; + lite::Tensor out_gpu_; + lite::Tensor out_cpu_; + + operators::AssignValueParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(AssignValueTest, fp32) { + InitFloatInput(); + AssignValueCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + 
cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + EXPECT_NEAR(out_cpu_.data()[i], out_ref_.data()[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/fc_compute.cu b/lite/kernels/cuda/fc_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..0ad376577b133540b782e2726564302a95ddf216 --- /dev/null +++ b/lite/kernels/cuda/fc_compute.cu @@ -0,0 +1,353 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "lite/kernels/cuda/fc_compute.h" + +#include + +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +struct FcTypeTraits; + +template <> +struct FcTypeTraits { + typedef float4 Type; +}; + +template +__global__ void AddBiasV2(const int num, const T* bias, T* data, int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const T bias_ptr = bias[bias_idx]; + const T in_ptr = data[index]; + T packed_val; + packed_val.x = in_ptr.x + bias_ptr.x; + packed_val.y = in_ptr.y + bias_ptr.y; + data[index] = packed_val; + } +} + +template <> +__global__ void AddBiasV2(const int num, + const half2* bias, + half2* data, + int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const half2 bias_ptr = bias[bias_idx]; + const half2 in_ptr = data[index]; +#if __CUDA_ARCH__ >= 530 + data[index] = __hadd2(in_ptr, bias_ptr); +#else + half2 packed_val; + packed_val.x = __hadd(in_ptr.x, bias_ptr.x); + packed_val.y = __hadd(in_ptr.y, bias_ptr.y); + data[index] = packed_val; +#endif + } +} + +template +__global__ void AddBiasReluV2(const int num, const T* bias, T* data, int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const T bias_ptr = bias[bias_idx]; + const T in_ptr = data[index]; + T packed_val; + packed_val.x = fmaxf(0.f, in_ptr.x + bias_ptr.x); + packed_val.y = fmaxf(0.f, in_ptr.y + bias_ptr.y); + data[index] = packed_val; + } +} + +template <> +__global__ void AddBiasReluV2(const int num, + const half2* bias, + half2* data, + int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const half2 bias_ptr = bias[bias_idx]; + const half2 in_ptr = data[index]; +#if __CUDA_ARCH__ >= 530 + data[index] = __hmul2(__hgt2(in_ptr + bias_ptr, __float2half2_rn(0.f)), + in_ptr + bias_ptr); +#else + const float2 bias = __half22float2(bias_ptr); + const float2 in = __half22float2(in_ptr); + data[index] = __floats2half2_rn( + bias.x + in.x > 0.0f ? static_cast(bias.x + in.x) : 0.0f, + bias.y + in.y > 0.0f ? 
static_cast(bias.y + in.y) : 0.0f); +#endif + } +} + +template +__global__ void AddBiasV4(const int num, const T* bias, T* data, int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const T bias_ptr = bias[bias_idx]; + const T in_ptr = data[index]; + T packed_val; + packed_val.x = in_ptr.x + bias_ptr.x; + packed_val.y = in_ptr.y + bias_ptr.y; + packed_val.z = in_ptr.z + bias_ptr.z; + packed_val.w = in_ptr.w + bias_ptr.w; + data[index] = packed_val; + } +} + +template +__global__ void AddBiasReluV4(const int num, const T* bias, T* data, int K) { + CUDA_KERNEL_LOOP(index, num) { + int bias_idx = index % K; + const T bias_ptr = bias[bias_idx]; + const T in_ptr = data[index]; + T packed_val; + packed_val.x = fmaxf(0.f, in_ptr.x + bias_ptr.x); + packed_val.y = fmaxf(0.f, in_ptr.y + bias_ptr.y); + packed_val.z = fmaxf(0.f, in_ptr.z + bias_ptr.z); + packed_val.w = fmaxf(0.f, in_ptr.w + bias_ptr.w); + data[index] = packed_val; + } +} + +template +__global__ void AddBias(const int num, const T* bias, T* data) { + int offset = blockIdx.x * num; + + for (int i = threadIdx.x; i < num; i += blockDim.x) { + T temp; +#if __CUDA_ARCH__ >= 350 + temp = __ldg(data + offset + i) + __ldg(bias + i); +#else + temp = data[offset + i] + bias[i]; +#endif + data[offset + i] = temp; + } +} + +template <> +__global__ void AddBias(const int num, const half* bias, half* data) { + int offset = blockIdx.x * num; + + for (int i = threadIdx.x; i < num; i += blockDim.x) { + half temp; +#if __CUDA_ARCH__ >= 350 + temp = __hadd(__ldg(data + offset + i), __ldg(bias + i)); +#else + temp = __hadd(data[offset + i], bias[i]); +#endif + data[offset + i] = temp; + } +} + +template +__global__ void AddBiasRelu(const int num, const T* bias, T* data) { + int offset = blockIdx.x * num; + + for (int i = threadIdx.x; i < num; i += blockDim.x) { + T temp; +#if __CUDA_ARCH__ >= 350 + temp = __ldg(data + offset + i) + __ldg(bias + i); +#else + temp = data[offset + i] + bias[i]; +#endif + data[offset + i] = static_cast(temp > 0) * temp; + } +} + +template <> +__global__ void AddBiasRelu(const int num, const half* bias, half* data) { + int offset = blockIdx.x * num; + + for (int i = threadIdx.x; i < num; i += blockDim.x) { + half temp; +#if __CUDA_ARCH__ >= 350 + temp = __hadd(__ldg(data + offset + i), __ldg(bias + i)); +#else + temp = __hadd(data[offset + i], bias[i]); +#endif + +#if __CUDA_ARCH__ >= 530 + data[offset + i] = + __hgt(temp, __float2half(0.0f)) ? temp : __float2half(0.0f); +#else + data[offset + i] = + __float2half(__half2float(temp) > 0.f ? __half2float(temp) : 0.f); +#endif + } +} + +template +void FcCompute::PrepareForRun() { + gemm_impl_.reset(new lite::cuda::math::Gemm); +} + +template +void FcCompute::Run() { + auto& context = this->ctx_->template As(); + auto stream = context.exec_stream(); + auto& param = this->template Param(); + + const auto* x_data = param.input->template data(); + const auto* w_data = param.w->template data(); + const auto* b_data = param.bias ? 
param.bias->template data() : nullptr; + + auto out_vec = param.output->dims().Vectorize(); + out_vec.back() = param.w->dims()[1]; + param.output->Resize(out_vec); + auto* out_data = param.output->template mutable_data(TARGET(kCUDA)); + + int in_num_col_dims = param.in_num_col_dims; + + int M = static_cast( + param.input->dims().Slice(0, param.in_num_col_dims).production()); + int K = static_cast( + param.input->dims() + .Slice(param.in_num_col_dims, param.input->dims().size()) + .production()); + int K2 = static_cast(param.w->dims()[0]); + int N = static_cast(param.w->dims()[1]); + CHECK_EQ(K, K2) << "x_w must be equal with y_h"; + + CHECK(gemm_impl_->init(false, false, M, N, K, &context)); + gemm_impl_->run(1.0f, 0.0f, x_data, w_data, out_data, &context); + + if (b_data == nullptr) { + return; + } + + std::string activation_type = param.activation_type; + if (N % 4 == 0) { + const int threads = 256; + const int num = M * N / 4; + const int blocks = (num + threads - 1) / threads; + typedef typename FcTypeTraits::Type trans_type; + const auto* bias_ptr_v4 = reinterpret_cast(b_data); + auto* data_ptr_v4 = reinterpret_cast(out_data); + if (activation_type == "relu") { + AddBiasReluV4<<>>( + num, bias_ptr_v4, data_ptr_v4, N / 4); + } else if (activation_type == "") { + AddBiasV4<<>>( + num, bias_ptr_v4, data_ptr_v4, N / 4); + } else { + LOG(FATAL) << "not supported activation type: " << activation_type; + } + } else { + const int threads = 256; + const int blocks = M; + if (activation_type == "relu") { + AddBiasRelu<<>>(N, b_data, out_data); + } else if (activation_type == "") { + AddBias<<>>(N, b_data, out_data); + } else { + LOG(FATAL) << "not supported activation type: " << activation_type; + } + } +} + +template <> +void FcCompute::Run() { + auto& context = this->ctx_->template As(); + auto stream = context.exec_stream(); + auto& param = this->template Param(); + + const auto* x_data = param.input->template data(); + const auto* w_data = param.w->template data(); + const auto* b_data = param.bias ? 
param.bias->template data() : nullptr; + + auto out_vec = param.output->dims().Vectorize(); + out_vec.back() = param.w->dims()[1]; + param.output->Resize(out_vec); + auto* out_data = param.output->template mutable_data(TARGET(kCUDA)); + + int in_num_col_dims = param.in_num_col_dims; + + int M = static_cast( + param.input->dims().Slice(0, param.in_num_col_dims).production()); + int K = static_cast( + param.input->dims() + .Slice(param.in_num_col_dims, param.input->dims().size()) + .production()); + int K2 = static_cast(param.w->dims()[0]); + int N = static_cast(param.w->dims()[1]); + CHECK_EQ(K, K2) << "x_w must be equal with y_h"; + + CHECK(gemm_impl_->init(false, false, M, N, K, &context)); + gemm_impl_->run(1.0f, 0.0f, x_data, w_data, out_data, &context); + + if (b_data == nullptr) { + return; + } + + std::string activation_type = param.activation_type; + if (N % 2 == 0) { + const int threads = 256; + const int num = M * N / 2; + const int blocks = (num + threads - 1) / threads; + const auto* bias_ptr_v2 = reinterpret_cast(b_data); + auto* data_ptr_v2 = reinterpret_cast(out_data); + if (activation_type == "relu") { + AddBiasReluV2<<>>( + num, bias_ptr_v2, data_ptr_v2, N / 2); + } else if (activation_type == "") { + AddBiasV2<<>>( + num, bias_ptr_v2, data_ptr_v2, N / 2); + } else { + LOG(FATAL) << "not supported activation type: " << activation_type; + } + } else { + const int threads = 256; + const int blocks = M; + if (activation_type == "relu") { + AddBiasRelu<<>>(N, b_data, out_data); + } else if (activation_type == "") { + AddBias<<>>(N, b_data, out_data); + } else { + LOG(FATAL) << "not supported activation type: " << activation_type; + } + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using FcFp32 = paddle::lite::kernels::cuda::FcCompute; + +using FcFp16 = paddle::lite::kernels::cuda::FcCompute; + +REGISTER_LITE_KERNEL(fc, kCUDA, kFloat, kNCHW, FcFp32, def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fc, kCUDA, kFP16, kNCHW, FcFp16, def) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/fc_compute.h b/lite/kernels/cuda/fc_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..700194c115824762411e952c77d06cb01a754bc0 --- /dev/null +++ b/lite/kernels/cuda/fc_compute.h @@ -0,0 +1,45 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
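The fp32 and fp16 paths above share one structure: a plain GEMM (out = X·W) followed by a fused bias/activation epilogue, where the packed AddBias[Relu]V4 / AddBias[Relu]V2 kernels are used only when N is divisible by 4 (float4) or 2 (half2), and one block per output row otherwise falls back to the scalar AddBias[Relu] kernels. The following host-side sketch is not part of the patch; the function name is illustrative, and it only mirrors the index arithmetic of the epilogue (bias broadcast over the M rows, optional clamp at zero).

```cpp
// Host reference for the bias + ReLU epilogue. The device version covers the
// same M*N elements with 256-thread blocks ((M*N/4 + 255)/256 blocks in the
// packed float4 case), selecting the bias with (index % K) where K = N/4.
#include <algorithm>
#include <cassert>
#include <vector>

void AddBiasReluRef(int M, int N, const std::vector<float>& bias,
                    std::vector<float>* out) {  // out holds the GEMM result
  assert(static_cast<int>(bias.size()) == N);
  assert(static_cast<int>(out->size()) == M * N);
  for (int i = 0; i < M * N; ++i) {
    const int col = i % N;  // broadcast the bias across rows
    (*out)[i] = std::max(0.f, (*out)[i] + bias[col]);
  }
}
```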
+ +#pragma once +#include + +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/kernel.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class FcCompute : public KernelLite { + public: + using param_t = operators::FcParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~FcCompute() = default; + + private: + std::unique_ptr> gemm_impl_{nullptr}; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/fc_compute_test.cc b/lite/kernels/cuda/fc_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..fa0dada729ca01cb1a4176ca585ce8f921f3aa42 --- /dev/null +++ b/lite/kernels/cuda/fc_compute_test.cc @@ -0,0 +1,231 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/fc_compute.h" + +#include +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class FcTest : public ::testing::Test { + protected: + FcTest() + : m_(8), + k_(16), + n_(64), + in_num_col_dims_(1), + act_type_("relu"), + x_shape_({m_, k_}), + w_shape_({k_, n_}), + b_shape_({n_}), + out_shape_({m_, n_}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_gpu_.Resize(lite::DDim(x_shape_)); + + w_ref_.Resize(lite::DDim(w_shape_)); + w_gpu_.Resize(lite::DDim(w_shape_)); + + b_ref_.Resize(lite::DDim(b_shape_)); + b_gpu_.Resize(lite::DDim(b_shape_)); + + auto x_ref_data = x_ref_.mutable_data(); + auto w_ref_data = w_ref_.mutable_data(); + auto b_ref_data = b_ref_.mutable_data(); + + // prepare input + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(i % 10 * 0.2); + } + for (int64_t i = 0; i < w_ref_.numel(); i++) { + w_ref_data[i] = static_cast(i % 10 * 0.2); + } + for (int64_t i = 0; i < b_ref_.numel(); i++) { + b_ref_data[i] = static_cast(i % 10 * 0.2); + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_cpu_.Resize(out_ref_.dims()); + out_gpu_.Resize(out_ref_.dims()); + RunBaseLine(&x_ref_, &w_ref_, &b_ref_, &out_ref_); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.input = &x_gpu_; + param_.w = &w_gpu_; + param_.bias = &b_gpu_; + param_.in_num_col_dims = in_num_col_dims_; + param_.activation_type = act_type_; + param_.output = &out_gpu_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + w_gpu_.Assign(w_ref_.data(), + w_gpu_.dims()); + b_gpu_.Assign(b_ref_.data(), + b_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(lite::DDim(x_shape_)); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = 
half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + w_half_.Resize(w_ref_.dims()); + auto w_half_data = w_half_.mutable_data(); + for (int64_t i = 0; i < w_half_.numel(); i++) { + w_half_data[i] = half(lite::float16(w_ref_.data()[i])); + } + w_gpu_.Assign(w_half_data, w_gpu_.dims()); + b_half_.Resize(b_ref_.dims()); + auto b_half_data = b_half_.mutable_data(); + for (int64_t i = 0; i < b_half_.numel(); i++) { + b_half_data[i] = half(lite::float16(b_ref_.data()[i])); + } + b_gpu_.Assign(b_half_data, b_gpu_.dims()); + } + + void RunBaseLine(const lite::Tensor* x, + const lite::Tensor* w, + const lite::Tensor* b, + lite::Tensor* out) { + const float* data_in = x->data(); + const float* bias = b->data(); + const float* weights = w->data(); + float* data_out = out->mutable_data(); + int out_rows = x->dims()[0]; + int in_cols = x->numel() / out_rows; + int out_cols = w->numel() / in_cols; + int index_out; + for (int i = 0; i < out_rows; i++) { + for (int j = 0; j < out_cols; j++) { + index_out = i * out_cols + j; + data_out[index_out] = bias ? bias[j] : 0; + for (int k = 0; k < in_cols; k++) { + data_out[index_out] += + data_in[i * in_cols + k] * weights[k * out_cols + j]; + } + if (act_type_ == "relu") { + data_out[index_out] *= static_cast(data_out[index_out] > 0); + } + } + } + } + + int m_, k_, n_, in_num_col_dims_; + std::string act_type_; + std::vector x_shape_, w_shape_, b_shape_, out_shape_; + lite::Tensor x_ref_, w_ref_, b_ref_, out_ref_; + lite::Tensor x_gpu_, w_gpu_, b_gpu_; + lite::Tensor x_half_, w_half_, b_half_; + lite::Tensor out_cpu_, out_gpu_; + + operators::FcParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(FcTest, TestFP32) { + InitFloatInput(); + FcCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = out_cpu_.data()[i]; + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / ref, 0.f, 1e-5); + } +} + +TEST_F(FcTest, TestFP16) { + InitHalfInput(); + FcCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 2e-2); + } 
+} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/lookup_table_compute_test.cc b/lite/kernels/cuda/lookup_table_compute_test.cc index 9323de14eb168fb55a68640350b87bf7040f5729..89050ea97f160b2fddb479966f59c05aafd8c268 100644 --- a/lite/kernels/cuda/lookup_table_compute_test.cc +++ b/lite/kernels/cuda/lookup_table_compute_test.cc @@ -12,14 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/cuda/lookup_table_compute.h" #include + #include #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/cuda/lookup_table_compute.h" namespace paddle { namespace lite { @@ -56,9 +58,7 @@ void LookupTableComputeRef(const operators::LookupTableParam& param) { } TEST(lookup_table_cuda, retrieve_op) { - auto lookup_table = - KernelRegistry::Global().Create( - "lookup_table"); + auto lookup_table = KernelRegistry::Global().Create("lookup_table"); ASSERT_FALSE(lookup_table.empty()); ASSERT_TRUE(lookup_table.front()); } diff --git a/lite/kernels/cuda/matmul_compute.cc b/lite/kernels/cuda/matmul_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..3b80b673dfabdccc7c728fa3081a81a870531acf --- /dev/null +++ b/lite/kernels/cuda/matmul_compute.cc @@ -0,0 +1,156 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/matmul_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void MatMulCompute::Run() { + auto& context = this->ctx_->template As(); + auto& param = this->template Param(); + + const auto* x_data = param.X->template data(); + const auto* y_data = param.Y->template data(); + auto* out_data = param.Out->template mutable_data(TARGET(kCUDA)); + bool transpose_x = param.transpose_X; + bool transpose_y = param.transpose_Y; + float alpha = param.alpha; + + auto x_dims = param.X->dims(); + auto y_dims = param.Y->dims(); + + int m = 0; + int k = 0; + int n = 0; + int batch = 0; + int64_t stride_x = 0; + int64_t stride_y = 0; + + if (x_dims.size() >= 2 && y_dims.size() >= 2 && + (x_dims.size() != 2 || y_dims.size() != 2)) { + // x: [B, ..., M, K], y: [B, ..., K, N], out: [B, ..., M, N] + // x: [B, M, K], y: [K, N], out: [B, M, N] + // or + // x: [M, K], y: [B, ..., K, N], out: [B, ..., M, N] + // x: [M, K], y: [B, K, N], out: [B, M, N] + strided_gemm_impl_->init(transpose_x, transpose_y, &context); + m = transpose_x ? x_dims[x_dims.size() - 1] : x_dims[x_dims.size() - 2]; + k = transpose_x ? x_dims[x_dims.size() - 2] : x_dims[x_dims.size() - 1]; + n = transpose_y ? y_dims[y_dims.size() - 2] : y_dims[y_dims.size() - 1]; + int batch_x = x_dims.size() == 2 ? 0 : x_dims.count(0, x_dims.size() - 2); + int batch_y = y_dims.size() == 2 ? 
0 : y_dims.count(0, y_dims.size() - 2); + CHECK(batch_x == batch_y || batch_x == 0 || batch_y == 0) + << "batch_size x should be equal to batch_size y, or " + "one of batch_size x and batch_size y should be 0. " + "But got batch_size x = " + << batch_x << ", batch_size y = " << batch_y; + batch = batch_x == 0 ? batch_y : batch_x; + stride_x = x_dims.size() == 2 ? 0 : m * k; + stride_y = y_dims.size() == 2 ? 0 : k * n; + strided_gemm_impl_->run(alpha, + 0.f, + m, + n, + k, + x_data, + y_data, + out_data, + batch, + stride_x, + stride_y); + } else if (x_dims.size() == 2 && y_dims.size() == 2) { + // x: [M, K], y: [K, N], out: [M, N] + m = transpose_x ? x_dims[1] : x_dims[0]; + k = transpose_x ? x_dims[0] : x_dims[1]; + n = transpose_y ? y_dims[0] : y_dims[1]; + gemm_impl_->init(transpose_x, transpose_y, m, n, k, &context); + gemm_impl_->run(alpha, 0.0f, x_data, y_data, out_data, &context); + } else if (x_dims.size() > 2 && y_dims.size() == 1) { + // x: [B, M, K], y: [K], out: [B, M] + strided_gemm_impl_->init(transpose_x, transpose_y, &context); + m = transpose_x ? x_dims[x_dims.size() - 1] : x_dims[x_dims.size() - 2]; + k = transpose_x ? x_dims[x_dims.size() - 2] : x_dims[x_dims.size() - 1]; + n = 1; + batch = x_dims.count(0, x_dims.size() - 2); + stride_x = m * k; + stride_y = 0; + strided_gemm_impl_->run(alpha, + 0.f, + m, + n, + k, + x_data, + y_data, + out_data, + batch, + stride_x, + stride_y); + } else if (x_dims.size() == 1 && y_dims.size() == 1) { + if (!transpose_x && !transpose_y) { + // x: [K], y: [K], out: [1] + m = 1; + k = x_dims[0]; + n = 1; + CHECK_EQ(x_dims[0], y_dims[0]) + << "x_dims[0] should be equal to y_dims[0]"; + gemm_impl_->init(false, false, m, n, k, &context); + gemm_impl_->run(alpha, 0.0f, x_data, y_data, out_data, &context); + } else if (transpose_x && transpose_y) { + // x: [M], y: [N], x_transpose: true, y_transpose: true, out: [M, N] + m = x_dims[0]; + k = 1; + n = y_dims[0]; + gemm_impl_->init(false, false, m, n, k, &context); + gemm_impl_->run(alpha, 0.0f, x_data, y_data, out_data, &context); + } else { + LOG(FATAL) << "not supported x_dims(" << x_dims << ") and y_dims(" + << y_dims << "), transpose_x(" << transpose_x + << "), transpose_y(" << transpose_y << ")"; + } + } else { + LOG(FATAL) << "not supported x_dims(" << x_dims << ") and y_dims(" << y_dims + << ")"; + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using MatMulFp32 = + paddle::lite::kernels::cuda::MatMulCompute; + +using MatMulFp16 = + paddle::lite::kernels::cuda::MatMulCompute; + +REGISTER_LITE_KERNEL(matmul, kCUDA, kFloat, kNCHW, MatMulFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(matmul, kCUDA, kFP16, kNCHW, MatMulFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/matmul_compute.h b/lite/kernels/cuda/matmul_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..69ad178d9184b7c3893f49a23024a14d7466115b --- /dev/null +++ b/lite/kernels/cuda/matmul_compute.h @@ -0,0 +1,50 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/backends/cuda/math/gemm.h" +#include "lite/backends/cuda/math/strided_gemm.h" +#include "lite/core/kernel.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class MatMulCompute : public KernelLite { + public: + using param_t = operators::MatMulParam; + + void PrepareForRun() override { + strided_gemm_impl_.reset(new lite::cuda::math::StridedGemm); + gemm_impl_.reset(new lite::cuda::math::Gemm); + } + + void Run() override; + + virtual ~MatMulCompute() = default; + + private: + std::unique_ptr> strided_gemm_impl_{ + nullptr}; + std::unique_ptr> gemm_impl_{nullptr}; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/matmul_compute_test.cc b/lite/kernels/cuda/matmul_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..89f40af3920ba0d3e36781955ffbf5eaba093257 --- /dev/null +++ b/lite/kernels/cuda/matmul_compute_test.cc @@ -0,0 +1,193 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
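MatMulCompute::Run above picks between a strided (batched) GEMM and a plain GEMM purely from the operand ranks and transpose flags. Below is a compact sketch of that shape resolution for the batched branch; it is not part of the patch, uses plain std::vector<int64_t> dims instead of lite::DDim, an illustrative function name, and omits the batch-size consistency CHECK.

```cpp
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

struct GemmShape { int m, n, k, batch; int64_t stride_x, stride_y; };

// Mirrors the ">= 2-D" branch: x is [B, ..., M, K], y is [B, ..., K, N];
// a rank-2 operand gets batch 0 and stride 0, i.e. it is broadcast.
GemmShape ResolveBatchedShape(const std::vector<int64_t>& x_dims,
                              const std::vector<int64_t>& y_dims,
                              bool trans_x, bool trans_y) {
  const size_t xr = x_dims.size(), yr = y_dims.size();
  auto count = [](const std::vector<int64_t>& d, size_t end) {
    return static_cast<int>(std::accumulate(
        d.begin(), d.begin() + end, int64_t{1}, std::multiplies<int64_t>()));
  };
  GemmShape s{};
  s.m = static_cast<int>(trans_x ? x_dims[xr - 1] : x_dims[xr - 2]);
  s.k = static_cast<int>(trans_x ? x_dims[xr - 2] : x_dims[xr - 1]);
  s.n = static_cast<int>(trans_y ? y_dims[yr - 2] : y_dims[yr - 1]);
  const int batch_x = xr == 2 ? 0 : count(x_dims, xr - 2);
  const int batch_y = yr == 2 ? 0 : count(y_dims, yr - 2);
  s.batch = batch_x == 0 ? batch_y : batch_x;
  s.stride_x = xr == 2 ? 0 : static_cast<int64_t>(s.m) * s.k;
  s.stride_y = yr == 2 ? 0 : static_cast<int64_t>(s.k) * s.n;
  return s;
}
```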
+ +#include "lite/kernels/cuda/matmul_compute.h" + +#include +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class MatMulTest : public ::testing::Test { + protected: + MatMulTest() + : x_trans_(false), + y_trans_(true), + alpha_(1.0f), + x_shape_({4, 1, 2}), + y_shape_({4, 1, 2}), + out_shape_({4, 1, 1}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_gpu_.Resize(x_ref_.dims()); + + y_ref_.Resize(lite::DDim(y_shape_)); + y_gpu_.Resize(y_ref_.dims()); + + auto x_ref_data = x_ref_.mutable_data(); + auto y_ref_data = y_ref_.mutable_data(); + + // prepare input + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(1); + } + for (int64_t i = 0; i < y_ref_.numel(); i++) { + y_ref_data[i] = static_cast(1); + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_cpu_.Resize(out_ref_.dims()); + out_gpu_.Resize(out_ref_.dims()); + RunBaseLine(); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.X = &x_gpu_; + param_.Y = &y_gpu_; + param_.transpose_X = x_trans_; + param_.transpose_Y = y_trans_; + param_.alpha = alpha_; + param_.Out = &out_gpu_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + y_gpu_.Assign(y_ref_.data(), + y_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(x_ref_.dims()); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); ++i) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + y_half_.Resize(y_ref_.dims()); + auto y_half_data = y_half_.mutable_data(); + for (int64_t i = 0; i < y_half_.numel(); i++) { + y_half_data[i] = half(lite::float16(y_ref_.data()[i])); + } + y_gpu_.Assign(y_half_data, y_gpu_.dims()); + } + + void RunBaseLine() { + auto* out_data = out_ref_.mutable_data(); + for (int64_t i = 0; i < out_ref_.numel(); ++i) { + out_data[i] = 2; + } + } + + bool x_trans_, y_trans_; + float alpha_; + std::vector x_shape_, y_shape_, out_shape_; + lite::Tensor x_ref_, y_ref_, out_ref_; + lite::Tensor x_gpu_, y_gpu_; + lite::Tensor x_half_, y_half_; + lite::Tensor out_cpu_, out_gpu_; + + operators::MatMulParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(MatMulTest, TestFP32) { + InitFloatInput(); + MatMulCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = out_cpu_.data()[i]; + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / ref, 0.f, 1e-5); + } +} + +TEST_F(MatMulTest, TestFP16) { + InitHalfInput(); + MatMulCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + 
cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_mask_compute.cu b/lite/kernels/cuda/sequence_mask_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..8a8f292c103b8fb7b55940cf075d4b80b3fb328d --- /dev/null +++ b/lite/kernels/cuda/sequence_mask_compute.cu @@ -0,0 +1,102 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/sequence_mask_compute.h" + +#include +#include + +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__global__ void SequenceMaskKernel(T* dst, + const int64_t* src, + int count, + int maxlen) { + CUDA_KERNEL_LOOP(index, count) { + int src_idx = index / maxlen; + int inner_idx = index % maxlen; + dst[index] = static_cast(inner_idx < src[src_idx] ? 
1 : 0); + } +} + +template +void SequenceMaskCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + const auto* x = param.X; + auto* x_data = x->template data(); + auto* y = param.Y; + int maxlen = param.maxlen; + + if (param.MaxLenTensor) { + auto* len_tensor_data = param.MaxLenTensor->template data(); + int32_t len_data{0}; + TargetWrapperCuda::MemcpySync( + &len_data, len_tensor_data, sizeof(int32_t), IoDirection::DtoH); + maxlen = len_data; + } + + if (maxlen < 0) { + maxlen = thrust::reduce( + x_data, x_data + x->numel(), 0, thrust::maximum()); + } + + auto y_dim = x->dims().Vectorize(); + y_dim.push_back(maxlen); + y->Resize(y_dim); + const int count = y->numel(); + auto* dst_data = y->template mutable_data(TARGET(kCUDA)); + if (param.out_dtype == 5) { + SequenceMaskKernel< + T><<>>( + dst_data, x_data, count, maxlen); + } else { + LOG(FATAL) << "not supported out_dtype: " << param.out_dtype; + } + CUDA_POST_KERNEL_CHECK; +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using SeqMaskFp32 = + paddle::lite::kernels::cuda::SequenceMaskCompute; + +using SeqMaskFp16 = + paddle::lite::kernels::cuda::SequenceMaskCompute; + +REGISTER_LITE_KERNEL(sequence_mask, kCUDA, kFloat, kNCHW, SeqMaskFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindInput("MaxLenTensor", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt32))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_mask, kCUDA, kFP16, kNCHW, SeqMaskFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindInput("MaxLenTensor", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt32))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_mask_compute.h b/lite/kernels/cuda/sequence_mask_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..3611587f0ce7daef1a88f5b6a916e2d30d33bcc1 --- /dev/null +++ b/lite/kernels/cuda/sequence_mask_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
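For clarity, the kernel above implements the usual sequence_mask semantics: an int64 length vector x of shape [n] produces an [n, maxlen] mask with out[i][j] = (j < x[i]), and a negative maxlen is replaced by the maximum length (computed with thrust::reduce). A minimal CPU reference, not part of the patch and with an illustrative name:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

std::vector<float> SequenceMaskRef(const std::vector<int64_t>& x, int maxlen) {
  if (maxlen < 0) {  // same fallback as the kernel: use the longest sequence
    maxlen = static_cast<int>(*std::max_element(x.begin(), x.end()));
  }
  std::vector<float> out(x.size() * maxlen, 0.f);
  for (size_t i = 0; i < x.size(); ++i) {
    for (int j = 0; j < maxlen; ++j) {
      out[i * maxlen + j] = j < x[i] ? 1.f : 0.f;
    }
  }
  return out;
}
// e.g. x = {3, 2, 1, 0}, maxlen = 4 yields rows 1110 / 1100 / 1000 / 0000,
// the fixture checked in sequence_mask_compute_test.cc below.
```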
+ +#pragma once +#include +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class SequenceMaskCompute : public KernelLite { + public: + using param_t = operators::SequenceMaskParam; + + void Run() override; + virtual ~SequenceMaskCompute() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_mask_compute_test.cc b/lite/kernels/cuda/sequence_mask_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..efbdf2ae00b6d1d9353831e94a202e5e42228b62 --- /dev/null +++ b/lite/kernels/cuda/sequence_mask_compute_test.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/sequence_mask_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequenceMaskTest : public ::testing::Test { + protected: + SequenceMaskTest() + : maxlen_(4), + out_dtype_(5), + x_data_({3, 2, 1, 0}), + out_shape_({static_cast(x_data_.size()), maxlen_}) { + x_ref_.Resize(lite::DDim({static_cast(x_data_.size())})); + x_gpu_.Resize(x_ref_.dims()); + + auto* x_ref_data = x_ref_.mutable_data(); + + // prepare input + for (size_t i = 0; i < x_data_.size(); i++) { + x_ref_data[i] = x_data_[i]; + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_gpu_.Resize(out_ref_.dims()); + out_cpu_.Resize(out_ref_.dims()); + RunBaseLine(&x_ref_, &out_ref_); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.X = &x_gpu_; + param_.Y = &out_gpu_; + param_.maxlen = maxlen_; + param_.out_dtype = out_dtype_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + } + + void InitHalfInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + } + + void RunBaseLine(const lite::Tensor* x, lite::Tensor* out) { + auto* out_data = out->mutable_data(); + + for (size_t i = 0; i < x_data_.size(); ++i) { + for (int j = 0; j < maxlen_; ++j) { + out_data[i * maxlen_ + j] = j < x_data_[i] ? 
1 : 0; + } + } + } + + int maxlen_, out_dtype_; + std::vector x_data_, out_shape_; + + lite::Tensor x_ref_, out_ref_; + lite::Tensor x_gpu_, out_gpu_; + lite::Tensor out_cpu_; + + operators::SequenceMaskParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(SequenceMaskTest, fp32) { + InitFloatInput(); + SequenceMaskCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + EXPECT_NEAR(out_cpu_.data()[i], out_ref_.data()[i], 1e-5); + } +} + +TEST_F(SequenceMaskTest, TestFP16) { + InitHalfInput(); + SequenceMaskCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_pad_compute.cu b/lite/kernels/cuda/sequence_pad_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..1e304f00633794dcac5d8ebfcd9d79defb4980f7 --- /dev/null +++ b/lite/kernels/cuda/sequence_pad_compute.cu @@ -0,0 +1,106 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/backends/cuda/math/sequence_padding.h" +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_pad_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void SequencePadCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + const auto* x = param.X; + const auto* pad_value = param.PadValue; + auto* out = param.Out; + auto* len_t = param.Length; + int padded_length = param.padded_length; + + int seq_num = x->lod()[0].size() - 1; + int max_seq_len = 0; + int step_width = x->numel() / x->dims()[0]; + + // calc for param.Lenght + seq_len_.resize(seq_num); + seq_offsets_vec_.resize(x->lod()[0].size()); + for (size_t i = 0; i < seq_num; ++i) { + max_seq_len = std::max( + max_seq_len, static_cast(x->lod()[0][i + 1] - x->lod()[0][i])); + seq_len_[i] = x->lod()[0][i + 1] - x->lod()[0][i]; + seq_offsets_vec_[i] = x->lod()[0][i]; + } + seq_offsets_vec_[seq_num] = x->lod()[0][seq_num]; + TargetWrapperCuda::MemcpyAsync( + len_t->template mutable_data(TARGET(kCUDA)), + seq_len_.data(), + sizeof(int64_t) * seq_len_.size(), + IoDirection::HtoD, + stream); + seq_offsets_.Resize({static_cast(x->lod()[0].size())}); + TargetWrapperCuda::MemcpyAsync( + seq_offsets_.mutable_data(TARGET(kCUDA)), + seq_offsets_vec_.data(), + sizeof(size_t) * seq_offsets_vec_.size(), + IoDirection::HtoD, + stream); + + const T* seq_data = x->template data(); + T* pad_data = out->template mutable_data(TARGET(kCUDA)); + const T* pad_value_data = pad_value->template data(); + + lite::cuda::math::SequencePadding(pad_data, + seq_data, + pad_value_data, + pad_value->numel() == 1, + seq_offsets_.data(), + seq_num, + padded_length, + step_width, + &stream); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using SeqPadFp32 = + paddle::lite::kernels::cuda::SequencePadCompute; + +using SeqPadFp16 = + paddle::lite::kernels::cuda::SequencePadCompute; + +REGISTER_LITE_KERNEL(sequence_pad, kCUDA, kFloat, kNCHW, SeqPadFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("PadValue", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Length", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_pad, kCUDA, kFP16, kNCHW, SeqPadFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("PadValue", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Length", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_pad_compute.h b/lite/kernels/cuda/sequence_pad_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..c494fe127d4eb5a7c0ba77a5c76ab1d1d0c1f2f2 --- /dev/null +++ b/lite/kernels/cuda/sequence_pad_compute.h @@ -0,0 +1,41 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class SequencePadCompute : public KernelLite { + public: + using param_t = operators::SequencePadParam; + + void Run() override; + virtual ~SequencePadCompute() = default; + + private: + lite::Tensor seq_offsets_; + std::vector seq_len_; + std::vector seq_offsets_vec_; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_pad_compute_test.cc b/lite/kernels/cuda/sequence_pad_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..91141984c98d5d105f51d0acc247aa878ff219a7 --- /dev/null +++ b/lite/kernels/cuda/sequence_pad_compute_test.cc @@ -0,0 +1,233 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
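SequencePadCompute above copies each LoD sub-sequence of X into a fixed-length slot of the output and fills the remainder with PadValue, while also emitting the per-sequence lengths. The sketch below is not part of the patch: it uses an illustrative name, flat std::vector buffers instead of lite::Tensor, and a scalar pad value (the kernel additionally supports a per-feature pad vector, selected by the pad_value->numel() == 1 flag).

```cpp
#include <cstddef>
#include <vector>

// x is a flattened LoD tensor with row offsets `lod` (e.g. {0, 2, 5}) and
// `step` features per row; the result has shape [seq_num, padded_len, step].
std::vector<float> SequencePadRef(const std::vector<float>& x,
                                  const std::vector<size_t>& lod,
                                  int padded_len, int step, float pad_value) {
  const size_t seq_num = lod.size() - 1;
  std::vector<float> out(seq_num * padded_len * step, pad_value);
  for (size_t i = 0; i < seq_num; ++i) {
    const size_t len = lod[i + 1] - lod[i];
    for (size_t r = 0; r < len; ++r) {
      for (int c = 0; c < step; ++c) {
        out[(i * padded_len + r) * step + c] = x[(lod[i] + r) * step + c];
      }
    }
  }
  return out;
}
```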
+ +#include "lite/kernels/cuda/sequence_pad_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequencePadTest : public ::testing::Test { + protected: + SequencePadTest() + : batch_(5), + features_(2), + padded_length_(3), + x_lod_({{0, 2, 5}}), + x_shape_({batch_, features_}), + pad_value_shape_({features_}), + out_shape_({static_cast(x_lod_[0].size() - 1), + padded_length_, + features_}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_ref_.set_lod(x_lod_); + x_gpu_.Resize(x_ref_.dims()); + + pad_value_ref_.Resize(lite::DDim(pad_value_shape_)); + pad_value_gpu_.Resize(pad_value_ref_.dims()); + + length_ref_.Resize( + lite::DDim({static_cast(x_lod_[0].size() - 1)})); + length_gpu_.Resize(length_ref_.dims()); + length_cpu_.Resize(length_ref_.dims()); + + auto x_ref_data = x_ref_.mutable_data(); + auto pad_value_ref_data = pad_value_ref_.mutable_data(); + + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(i); + } + for (int64_t i = 0; i < pad_value_ref_.numel(); i++) { + pad_value_ref_data[i] = static_cast(i); + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_gpu_.Resize(out_ref_.dims()); + out_cpu_.Resize(out_ref_.dims()); + RunBaseLine(&x_ref_, &pad_value_ref_, &out_ref_, &length_ref_); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.X = &x_gpu_; + param_.PadValue = &pad_value_gpu_; + param_.Length = &length_gpu_; + param_.Out = &out_gpu_; + param_.padded_length = padded_length_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + x_gpu_.set_lod(x_ref_.lod()); + pad_value_gpu_.Assign( + pad_value_ref_.data(), pad_value_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(lite::DDim(x_shape_)); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + x_gpu_.set_lod(x_ref_.lod()); + pad_value_half_.Resize(pad_value_ref_.dims()); + auto pad_value_half_data = pad_value_half_.mutable_data(); + for (int64_t i = 0; i < pad_value_half_.numel(); i++) { + pad_value_half_data[i] = + half(lite::float16(pad_value_ref_.data()[i])); + } + pad_value_gpu_.Assign( + pad_value_half_data, pad_value_gpu_.dims()); + } + + void RunBaseLine(const lite::Tensor* x, + const lite::Tensor* pad_value, + lite::Tensor* out, + lite::Tensor* length) { + auto* length_data = length->mutable_data(); + auto* out_data = out->mutable_data(); + length_data[0] = 2; + length_data[1] = 3; + + for (size_t i = 0; i < 4; ++i) { + out_data[i] = i; + } + out_data[4] = 0; + out_data[5] = 1; + for (size_t i = 4; i < 10; ++i) { + out_data[2 + i] = i; + } + } + + int batch_, features_, padded_length_; + LoD x_lod_; + std::vector x_shape_, pad_value_shape_, out_shape_; + + lite::Tensor x_ref_, pad_value_ref_, out_ref_, length_ref_; + lite::Tensor x_gpu_, pad_value_gpu_, out_gpu_, length_gpu_; + lite::Tensor x_half_, pad_value_half_; + lite::Tensor out_cpu_, length_cpu_; + + operators::SequencePadParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(SequencePadTest, fp32) { + InitFloatInput(); + SequencePadCompute kernel; + kernel.SetParam(param_); 
+ kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + CopySync(length_cpu_.mutable_data(), + length_gpu_.data(), + sizeof(int64_t) * length_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + EXPECT_NEAR(out_cpu_.data()[i], out_ref_.data()[i], 1e-5); + } + for (int i = 0; i < length_gpu_.numel(); ++i) { + EXPECT_NEAR( + length_cpu_.data()[i], length_ref_.data()[i], 1e-5); + } +} + +TEST_F(SequencePadTest, TestFP16) { + InitHalfInput(); + SequencePadCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + const int64_t* length_gpu_data = length_gpu_.data(); + int64_t* length_cpu_data = length_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + CopySync(length_cpu_data, + length_gpu_data, + sizeof(int64_t) * length_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2); + } + for (int i = 0; i < length_gpu_.numel(); ++i) { + EXPECT_NEAR( + length_cpu_.data()[i], length_ref_.data()[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_unpad_compute.cu b/lite/kernels/cuda/sequence_unpad_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..bdedd74588884aa1e4b7f7c7ae3f414810b0826a --- /dev/null +++ b/lite/kernels/cuda/sequence_unpad_compute.cu @@ -0,0 +1,92 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "lite/backends/cuda/math/sequence_padding.h" +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_unpad_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void SequenceUnpadCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + const auto* pad_tensor = param.X; + const auto* len_t = param.Length; + auto* seq_tensor = param.Out; + + int padded_length = pad_tensor->dims()[1]; + int seq_num = seq_tensor->lod()[0].size() - 1; + int max_seq_len = 0; + int step_width = seq_tensor->numel() / seq_tensor->dims()[0]; + + seq_offsets_vec_.resize(seq_tensor->lod()[0].size()); + for (size_t i = 0; i < seq_num; ++i) { + max_seq_len = std::max(max_seq_len, + static_cast(seq_tensor->lod()[0][i + 1] - + seq_tensor->lod()[0][i])); + seq_offsets_vec_[i] = seq_tensor->lod()[0][i]; + } + seq_offsets_vec_[seq_num] = seq_tensor->lod()[0][seq_num]; + seq_offsets_.Resize({static_cast(seq_tensor->lod()[0].size())}); + TargetWrapperCuda::MemcpyAsync( + seq_offsets_.mutable_data(TARGET(kCUDA)), + seq_offsets_vec_.data(), + sizeof(size_t) * seq_offsets_vec_.size(), + IoDirection::HtoD, + stream); + + const T* pad_data = pad_tensor->template data(); + T* seq_data = seq_tensor->template mutable_data(TARGET(kCUDA)); + + lite::cuda::math::SequenceUnpadding(seq_data, + pad_data, + seq_offsets_.data(), + seq_num, + padded_length, + step_width, + &stream); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +using SeqUnadFp32 = + paddle::lite::kernels::cuda::SequenceUnpadCompute; + +using SeqUnadFp16 = + paddle::lite::kernels::cuda::SequenceUnpadCompute; + +REGISTER_LITE_KERNEL(sequence_unpad, kCUDA, kFloat, kNCHW, SeqUnadFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Length", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_unpad, kCUDA, kFP16, kNCHW, SeqUnadFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindInput("Length", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_unpad_compute.h b/lite/kernels/cuda/sequence_unpad_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..f36520ea15c4ad504b2fd357d8729d6d0dbc2615 --- /dev/null +++ b/lite/kernels/cuda/sequence_unpad_compute.h @@ -0,0 +1,40 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
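SequenceUnpadCompute above is the inverse operation: it reads the padded [seq_num, padded_length, step_width] tensor and writes back only the rows covered by the output LoD. A matching CPU sketch, again with an illustrative name and flat buffers, following the same conventions as the pad sketch earlier:

```cpp
#include <cstddef>
#include <vector>

std::vector<float> SequenceUnpadRef(const std::vector<float>& padded,
                                    const std::vector<size_t>& lod,
                                    int padded_len, int step) {
  std::vector<float> out(lod.back() * step, 0.f);
  const size_t seq_num = lod.size() - 1;
  for (size_t i = 0; i < seq_num; ++i) {
    const size_t len = lod[i + 1] - lod[i];
    for (size_t r = 0; r < len; ++r) {
      for (int c = 0; c < step; ++c) {
        out[(lod[i] + r) * step + c] = padded[(i * padded_len + r) * step + c];
      }
    }
  }
  return out;
}
```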
+ +#pragma once +#include +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class SequenceUnpadCompute : public KernelLite { + public: + using param_t = operators::SequenceUnpadParam; + + void Run() override; + virtual ~SequenceUnpadCompute() = default; + + private: + lite::Tensor seq_offsets_; + std::vector seq_offsets_vec_; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_unpad_compute_test.cc b/lite/kernels/cuda/sequence_unpad_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..417115a50b6d086bd628a0b93a7d45c688ea18af --- /dev/null +++ b/lite/kernels/cuda/sequence_unpad_compute_test.cc @@ -0,0 +1,198 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/sequence_unpad_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequenceUnpadTest : public ::testing::Test { + protected: + SequenceUnpadTest() + : batch_(5), + features_(2), + padded_length_(3), + out_lod_({{0, 2, 5}}), + x_shape_({static_cast(out_lod_[0].size() - 1), + padded_length_, + features_}), + out_shape_({batch_, features_}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_gpu_.Resize(x_ref_.dims()); + + length_ref_.Resize( + lite::DDim({static_cast(out_lod_[0].size() - 1)})); + length_gpu_.Resize(length_ref_.dims()); + + auto* x_ref_data = x_ref_.mutable_data(); + auto* length_ref_data = length_ref_.mutable_data(); + + // prepare input + for (int64_t i = 0; i < x_ref_.numel(); i++) { + x_ref_data[i] = static_cast(i); + } + for (size_t i = 0; i < out_lod_[0].size() - 1; ++i) { + length_ref_data[i] = out_lod_[0][i + 1] - out_lod_[0][i]; + } + + out_ref_.Resize(lite::DDim(out_shape_)); + out_ref_.set_lod(out_lod_); + out_gpu_.Resize(out_ref_.dims()); + out_gpu_.set_lod(out_ref_.lod()); + out_cpu_.Resize(out_ref_.dims()); + out_cpu_.set_lod(out_ref_.lod()); + + RunBaseLine(&x_ref_, &length_ref_, &out_ref_); + + InitParamAndContext(); + } + + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.X = &x_gpu_; + param_.Length = &length_gpu_; + param_.Out = &out_gpu_; + } + + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + length_gpu_.Assign( + length_ref_.data(), length_gpu_.dims()); + } + + void InitHalfInput() { + x_half_.Resize(lite::DDim(x_shape_)); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + length_gpu_.Assign( + length_ref_.data(), 
length_gpu_.dims()); + } + + void RunBaseLine(const lite::Tensor* X, + const lite::Tensor* Length, + lite::Tensor* Out) { + auto* out_data = Out->mutable_data(); + + for (size_t i = 0; i < 4; ++i) { + out_data[i] = i; + } + for (size_t i = 6; i < 12; ++i) { + out_data[i - 2] = i; + } + } + + int batch_, features_, padded_length_; + LoD out_lod_; + std::vector x_shape_, out_shape_; + + lite::Tensor x_ref_, out_ref_, length_ref_; + lite::Tensor x_gpu_, out_gpu_, length_gpu_; + lite::Tensor x_half_; + lite::Tensor out_cpu_, length_cpu_; + + operators::SequencePadParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(SequenceUnpadTest, fp32) { + InitFloatInput(); + SequenceUnpadCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + EXPECT_NEAR(out_cpu_.data()[i], out_ref_.data()[i], 1e-5); + } +} + +TEST_F(SequenceUnpadTest, TestFP16) { + InitHalfInput(); + SequenceUnpadCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/topk_pooling_compute.cu b/lite/kernels/cuda/topk_pooling_compute.cu new file mode 100644 index 0000000000000000000000000000000000000000..bb4499b637a1435dec2dc913bf8141edd60130fc --- /dev/null +++ b/lite/kernels/cuda/topk_pooling_compute.cu @@ -0,0 +1,200 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
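The topk_pooling kernel introduced next reduces each (batch, channel) window, whose valid height and width come from the LoD offsets of Y and X, to its k largest values in descending order, using shared memory for the repeated max-reduction. A CPU sketch of the intended result follows; the name is illustrative, and zero-filling the tail when the window holds fewer than k elements is an assumption here, since the corresponding loop in the CUDA kernel is commented out and the output is relied on to be zero-initialised.

```cpp
#include <algorithm>
#include <functional>
#include <vector>

// `in` points at one (batch, channel) plane; rows are width_stride apart.
void TopkPoolRef(const float* in, int width_stride, int height, int width,
                 int k, float* out /* k values, descending */) {
  std::vector<float> vals;
  vals.reserve(height * width);
  for (int h = 0; h < height; ++h) {
    for (int w = 0; w < width; ++w) {
      vals.push_back(in[h * width_stride + w]);
    }
  }
  const int real_k = std::min<int>(k, static_cast<int>(vals.size()));
  std::partial_sort(vals.begin(), vals.begin() + real_k, vals.end(),
                    std::greater<float>());
  for (int i = 0; i < k; ++i) {
    out[i] = i < real_k ? vals[i] : 0.f;
  }
}
```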
+ +#include "lite/kernels/cuda/topk_pooling_compute.h" + +#include +#include + +#include "lite/backends/cuda/target_wrapper.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__global__ void top_k_pooling_batch_kernel_reduction(Dtype *output_data, + const Dtype *input, + const int *height_offset, + const int *width_offset, + const int batch_size, + const int channel_num, + const int height_stride, + const int width_stride, + const int k) { + const Dtype *input_start = + input + + (blockIdx.x * channel_num + blockIdx.y) * height_stride * width_stride; + Dtype *output_start = + output_data + (blockIdx.x * channel_num + blockIdx.y) * k; + + int width = width_offset[blockIdx.x + 1] - width_offset[blockIdx.x]; + int height = height_offset[blockIdx.x + 1] - height_offset[blockIdx.x]; + int real_k = k < height * width ? k : height * width; + + extern __shared__ Dtype smem[]; + + Dtype min_val = -100000.0f; + for (int j = threadIdx.x; j < height * width; j += blockDim.x) { + int index_tmp = (j / width) * width_stride + j % width; + smem[j] = input_start[index_tmp]; + } + __syncthreads(); + + // get max val + int t = 0; + for (; t < real_k; ++t) { + // reduction + for (int gap = height * width; gap > 1;) { + if (threadIdx.x == 0) { // edge cond + if (gap % 2 != 0) { + Dtype value_first = smem[0]; + Dtype value_gap = smem[gap - 1]; + if (value_first < value_gap) { + smem[0] = value_gap; + smem[gap - 1] = value_first; + } + } + } + gap >>= 1; + for (int j = threadIdx.x; j < gap; j += blockDim.x) { + Dtype value_first = smem[j]; + Dtype value_gap = smem[j + gap]; + if (value_first < value_gap) { + smem[j] = value_gap; + smem[j + gap] = value_first; + } + } + __syncthreads(); + } + if (threadIdx.x == 0) { + output_start[t] = smem[0]; + smem[0] = min_val; + } + __syncthreads(); + } + for (int i = threadIdx.x; i < (k - t); i += blockDim.x) { + // output_start[t + i] = 0.0f; + } +} + +template +void TopkPoolingCompute::PrepareForRun() { + int device_id = lite::TargetWrapperCuda::GetCurDevice(); + cudaDeviceProp deviceProp; + CUDA_CALL(cudaGetDeviceProperties(&deviceProp, device_id)); + _shared_mem_size = deviceProp.sharedMemPerBlock; +} + +template +void TopkPoolingCompute::Run() { + auto ¶m = this->Param(); + auto &ctx = this->ctx_->template As(); + auto cuda_stream = ctx.exec_stream(); + + CHECK(param.X->lod().size() > 0 && param.X->lod()[0].size() > 0) + << "X sequence offset is not valid"; + CHECK(param.Y->lod().size() > 0 && param.Y->lod()[0].size() > 0) + << "Y sequence offset is not valid"; + + int width_offset_len = param.X->lod()[0].size(); + lite::DDim width_offset_shape(std::vector{width_offset_len}); + _width_offset.Resize(width_offset_shape); + std::vector width_lod_0(width_offset_len, 0); + for (size_t i = 0; i < param.X->lod()[0].size(); ++i) { + width_lod_0[i] = static_cast(param.X->lod()[0][i]); + } + lite::TargetWrapperCuda::MemcpyAsync( + _width_offset.mutable_data(TARGET(kCUDA)), + width_lod_0.data(), + sizeof(int) * width_offset_len, + lite::IoDirection::HtoD, + cuda_stream); + + int height_offset_len = param.Y->lod()[0].size(); + lite::DDim height_offset_shape(std::vector{height_offset_len}); + _height_offset.Resize(height_offset_shape); + std::vector height_lod_0(height_offset_len, 0); + for (size_t i = 0; i < param.Y->lod()[0].size(); ++i) { + height_lod_0[i] = static_cast(param.Y->lod()[0][i]); + } + lite::TargetWrapperCuda::MemcpyAsync( + _height_offset.mutable_data(TARGET(kCUDA)), + 
height_lod_0.data(), + sizeof(int) * height_offset_len, + lite::IoDirection::HtoD, + cuda_stream); + + const Tensor *x_tensor = param.X; + Tensor *out_tensor = param.Out; + const T *in_data = x_tensor->data(); + T *out_data = out_tensor->mutable_data(TARGET(kCUDA)); + + int num = x_tensor->dims()[0]; + int channel = x_tensor->dims()[1]; + int height = x_tensor->dims()[2]; + int width = x_tensor->dims()[3]; + + const int *height_offset = _height_offset.data(); + const int *width_offset = _width_offset.data(); + + int feat_map_size = height * width; + + if (feat_map_size * sizeof(T) <= _shared_mem_size) { + dim3 blocks(num, channel); + dim3 threads(32, 1); + + top_k_pooling_batch_kernel_reduction< + T><<>>( + out_data, + in_data, + height_offset, + width_offset, + num, + channel, + height, + width, + param.top_k); + } else { + LOG(FATAL) << "Not implemented. Exceeded the shared memory limit."; + } + CUDA_POST_KERNEL_CHECK; +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(topk_pooling, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::TopkPoolingCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/topk_pooling_compute.h b/lite/kernels/cuda/topk_pooling_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..abf16163812a74de8ebb8cce0dd7d80469e0a7d8 --- /dev/null +++ b/lite/kernels/cuda/topk_pooling_compute.h @@ -0,0 +1,45 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/kernel.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class TopkPoolingCompute + : public KernelLite { + public: + using param_t = operators::TopkPoolingParam; + + void Run() override; + + void PrepareForRun() override; + + virtual ~TopkPoolingCompute() = default; + + protected: + lite::Tensor _height_offset; + lite::Tensor _width_offset; + int _shared_mem_size; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/topk_pooling_compute_test.cc b/lite/kernels/cuda/topk_pooling_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0fb5c29f25bba0b4cc00f3eb58fc1c0726e6b23b --- /dev/null +++ b/lite/kernels/cuda/topk_pooling_compute_test.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/topk_pooling_compute.h" + +#include + +#include +#include +#include +#include + +#include "lite/api/test_helper.h" +#include "lite/utils/float16.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class TopkPooingTest : public ::testing::Test { + protected: + TopkPooingTest() + : num(2), + channels(4), + height(4), + width(4), + top_k(2), + feat_map_num(height * width), + x_lod({{0, 4, 7}}), + y_lod({{0, 4, 7}}), + x_shape({num, channels, height, width}), + out_shape({num, channels * top_k}) { + CHECK_EQ(x_lod[0].size(), num + 1) << "invalid input."; + for (size_t i = 1; i < x_lod[0].size(); ++i) { + CHECK_LE(x_lod[0][i] - x_lod[0][i - 1], height) << "invalid input."; + } + + X_gpu.Resize(lite::DDim(x_shape)); + X_ref.Resize(lite::DDim(x_shape)); + X_ref.set_lod(x_lod); + Y_gpu.Resize(lite::DDim(x_shape)); + Y_ref.Resize(lite::DDim(x_shape)); + Y_ref.set_lod(y_lod); + auto x_ref_data = X_ref.mutable_data(); + auto y_ref_data = Y_ref.mutable_data(); + + // prepare input + for (int64_t i = 0; i < X_ref.numel(); i++) { + x_ref_data[i] = static_cast(i % 16); + } + for (int64_t i = 0; i < Y_ref.numel(); i++) { + y_ref_data[i] = static_cast(i % 16); + } + + Out_ref.Resize(lite::DDim(out_shape)); + Out_gpu.Resize(lite::DDim(out_shape)); + Out_cpu.Resize(lite::DDim(out_shape)); + + device_init(); + } + + void device_init() { + ctx.reset(new KernelContext); + cudaStreamCreate(&stream); + param.X = &X_gpu; + param.Y = &Y_gpu; + param.Out = &Out_gpu; + param.top_k = top_k; + param.feat_map_num = feat_map_num; + } + + void float_data_init() { + X_gpu.Assign(X_ref.data(), + X_gpu.dims()); + X_gpu.set_lod(X_ref.lod()); + Y_gpu.Assign(Y_ref.data(), + Y_gpu.dims()); + Y_gpu.set_lod(Y_ref.lod()); + } + + void half_data_init() {} + + void cpu_base(const lite::Tensor* X, + const lite::Tensor* Y, + lite::Tensor* Out) {} + + int num, channels, height, width; + int top_k, feat_map_num; + std::vector> x_lod, y_lod; + std::vector x_shape, out_shape; + lite::Tensor X_ref, Y_ref, Out_ref; + lite::Tensor X_gpu, Y_gpu; + lite::Tensor Out_cpu, Out_gpu; + + operators::TopkPoolingParam param; + std::unique_ptr ctx; + cudaStream_t stream; +}; + +TEST_F(TopkPooingTest, fp32) { + float_data_init(); + auto& context = ctx->As(); + context.SetExecStream(stream); + TopkPoolingCompute kernel; + kernel.SetParam(param); + kernel.SetContext(std::move(ctx)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(Out_cpu.mutable_data(), + Out_gpu.data(), + sizeof(float) * Out_gpu.numel(), + IoDirection::DtoH); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/transpose_compute.cu 
b/lite/kernels/cuda/transpose_compute.cu index c5693c674c573d7c9f59034dd3c0985c9d94a22f..ec7ecd16e0daa9f9cb696224ae498825fe75c5b4 100644 --- a/lite/kernels/cuda/transpose_compute.cu +++ b/lite/kernels/cuda/transpose_compute.cu @@ -13,17 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "lite/kernels/cuda/transpose_compute.h" + #include + #include "lite/core/op_registry.h" -#include "lite/kernels/cuda/transpose_compute.h" namespace paddle { namespace lite { namespace kernels { namespace cuda { -void TransposeCompute::Run() { - auto& param = this->Param(); +template +void TransposeCompute::Run() { + auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); auto stream = ctx.exec_stream(); @@ -31,8 +34,8 @@ void TransposeCompute::Run() { lite::Tensor* Out = param.output; std::vector axes = param.axis; - const float* in = X->data(); - float* out = Out->mutable_data(TARGET(kCUDA)); + const T* in = X->template data(); + T* out = Out->mutable_data(TARGET(kCUDA)); int ndim = X->dims().size(); std::vector dims = X->dims().data(); @@ -40,7 +43,7 @@ void TransposeCompute::Run() { // NCHW -> NHWC if (axes.size() == 4 && axes[0] == 0 && axes[1] == 2 && axes[2] == 3 && axes[3] == 1) { - trans.NCHW2NHWC(dims[0], dims[1], dims[2] * dims[3], in, out, &stream); + trans_.NCHW2NHWC(dims[0], dims[1], dims[2] * dims[3], in, out, &stream); cudaError_t error = cudaGetLastError(); if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); return; @@ -49,13 +52,13 @@ void TransposeCompute::Run() { // NHWC -> NCHW if (axes.size() == 4 && axes[0] == 0 && axes[1] == 3 && axes[2] == 1 && axes[3] == 2) { - trans.NHWC2NCHW(dims[0], dims[3], dims[1] * dims[2], in, out, &stream); + trans_.NHWC2NCHW(dims[0], dims[3], dims[1] * dims[2], in, out, &stream); cudaError_t error = cudaGetLastError(); if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); return; } - trans.transpose(out, in, dims, axes, &stream); + trans_.transpose(out, in, dims, axes, &stream); cudaError_t error = cudaGetLastError(); if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); } @@ -65,34 +68,31 @@ void TransposeCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL(transpose, - kCUDA, - kFloat, - kNCHW, - paddle::lite::kernels::cuda::TransposeCompute, - def) +using TransFp32 = + paddle::lite::kernels::cuda::TransposeCompute; + +using TransFp16 = + paddle::lite::kernels::cuda::TransposeCompute; + +REGISTER_LITE_KERNEL(transpose, kCUDA, kFloat, kNCHW, TransFp32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) .Finalize(); -REGISTER_LITE_KERNEL(transpose2, - kCUDA, - kFloat, - kNCHW, - paddle::lite::kernels::cuda::TransposeCompute, - def) +REGISTER_LITE_KERNEL(transpose2, kCUDA, kFloat, kNCHW, TransFp32, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kCUDA))}) .Finalize(); -// REGISTER_LITE_KERNEL(transpose2, -// kCUDA, -// kFloat, -// kNCHW, -// paddle::lite::kernels::cuda::TransposeCompute, -// def) -// .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) -// .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) -// .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kCUDA))}) -// .Finalize(); +REGISTER_LITE_KERNEL(transpose, kCUDA, kFP16, kNCHW, TransFp16, def) + .BindInput("X", 
{LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); + +REGISTER_LITE_KERNEL(transpose2, kCUDA, kFP16, kNCHW, TransFp16, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .BindOutput("XShape", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))}) + .Finalize(); diff --git a/lite/kernels/cuda/transpose_compute.h b/lite/kernels/cuda/transpose_compute.h index 273d072231fb0608deb9ed729bdf153395ee983f..7e373c3b26c1701cd467148a06466a86f04e0c95 100644 --- a/lite/kernels/cuda/transpose_compute.h +++ b/lite/kernels/cuda/transpose_compute.h @@ -21,7 +21,8 @@ namespace lite { namespace kernels { namespace cuda { -class TransposeCompute : public KernelLite { +template +class TransposeCompute : public KernelLite { public: using param_t = operators::TransposeParam; @@ -29,7 +30,7 @@ class TransposeCompute : public KernelLite { virtual ~TransposeCompute() = default; private: - lite::cuda::math::Transpose trans; + lite::cuda::math::Transpose trans_; }; } // namespace cuda diff --git a/lite/kernels/cuda/transpose_compute_test.cc b/lite/kernels/cuda/transpose_compute_test.cc index bf0d803a14a5f0e47c96128b953ae72a18798205..89654dd9c8a200f5672f23bd08c32b40b9b6f99e 100644 --- a/lite/kernels/cuda/transpose_compute_test.cc +++ b/lite/kernels/cuda/transpose_compute_test.cc @@ -13,11 +13,16 @@ // limitations under the License. #include "lite/kernels/cuda/transpose_compute.h" + #include #include #include #include +#include "lite/api/test_helper.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/utils/float16.h" + namespace paddle { namespace lite { namespace kernels { @@ -31,9 +36,9 @@ namespace { #define OUT(n, c, h, w) \ output_data[w + h * output_w + c * output_h * output_w + \ n * output_c * output_h * output_w] -void nchw2nhwc_ref(lite::Tensor* input, - lite::Tensor* output, - const std::vector axies) { +void Nchw2nhwcBaseLine(lite::Tensor* input, + lite::Tensor* output, + const std::vector axies) { auto* input_data = input->data(); auto* output_data = output->mutable_data(); @@ -64,9 +69,9 @@ void nchw2nhwc_ref(lite::Tensor* input, #define OUT(n, h, w, c) \ output_data[c + w * output_c + h * output_w * output_c + \ n * output_h * output_w * output_c] -void nhwc2nchw_ref(lite::Tensor* input, - lite::Tensor* output, - const std::vector axies) { +void Nhwc2nchwBaseLine(lite::Tensor* input, + lite::Tensor* output, + const std::vector& axies) { auto* input_data = input->data(); auto* output_data = output->mutable_data(); @@ -89,7 +94,7 @@ void nhwc2nchw_ref(lite::Tensor* input, } } -void transpose_ref(lite::Tensor* input, +void TransBaseLine(const lite::Tensor* input, lite::Tensor* output, const std::vector axes) { auto* input_data = input->data(); @@ -123,7 +128,7 @@ void transpose_ref(lite::Tensor* input, } // namespace TEST(transpose_nchw, normal) { - TransposeCompute transpose_kernel; + TransposeCompute transpose_kernel; std::unique_ptr ctx(new KernelContext); auto& context = ctx->As(); @@ -168,16 +173,15 @@ TEST(transpose_nchw, normal) { auto* out_data = out.mutable_data(TARGET(kCUDA)); CopySync( out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); - nchw2nhwc_ref(&x_ref, &out_ref, axes); + Nchw2nhwcBaseLine(&x_ref, &out_ref, axes); auto* out_ref_data = out_ref.mutable_data(); - // transpose_ref(&x_ref, &out_ref, axes); for (int i = 0; i < out.numel(); i++) { 
EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); } } TEST(transpose_nhwc, normal) { - TransposeCompute transpose_kernel; + TransposeCompute transpose_kernel; std::unique_ptr ctx(new KernelContext); auto& context = ctx->As(); @@ -220,62 +224,146 @@ TEST(transpose_nhwc, normal) { auto* out_data = out.mutable_data(TARGET(kCUDA)); CopySync( out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); - nhwc2nchw_ref(&x_ref, &out_ref, axes); - // transpose_ref(&x_ref, &out_ref, axes); + Nhwc2nchwBaseLine(&x_ref, &out_ref, axes); auto* out_ref_data = out_ref.mutable_data(); for (int i = 0; i < out.numel(); i++) { EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); } } -TEST(transpose, normal) { - TransposeCompute transpose_kernel; - std::unique_ptr ctx(new KernelContext); - auto& context = ctx->As(); +class TransposeTest : public ::testing::Test { + protected: + TransposeTest() + : C_(3), + H_(128), + W_(64), + axes_({1, 2, 0}), + x_shape_({C_, H_, W_}), + out_shape_({H_, W_, C_}) { + x_ref_.Resize(lite::DDim(x_shape_)); + x_gpu_.Resize(x_ref_.dims()); + + auto X_ref__data = x_ref_.mutable_data(); + + // prepare input + for (int64_t i = 0; i < x_ref_.numel(); i++) { + X_ref__data[i] = static_cast(i); + } - operators::TransposeParam param; + out_ref_.Resize(lite::DDim(out_shape_)); + out_gpu_.Resize(out_ref_.dims()); + out_cpu_.Resize(out_ref_.dims()); + RunBaseLine(&x_ref_, &out_ref_); - lite::Tensor x, x_cpu, x_ref; - lite::Tensor out, out_cpu, out_ref; + InitParamAndContext(); + } - int C = 3, H = 128, W = 128; - std::vector axes({2, 0, 1}); - x.Resize({C, H, W}); - out.Resize({W, C, H}); + void InitParamAndContext() { + ctx_.reset(new KernelContext); + cudaStreamCreate(&stream_); + auto& context = ctx_->As(); + context.SetExecStream(stream_); + param_.x = &x_gpu_; + param_.output = &out_gpu_; + param_.axis = axes_; + } - x_cpu.Resize({C, H, W}); - out_cpu.Resize({W, C, H}); + void InitFloatInput() { + x_gpu_.Assign(x_ref_.data(), + x_gpu_.dims()); + } - x_ref.Resize({C, H, W}); - out_ref.Resize({W, C, H}); + void InitHalfInput() { + x_half_.Resize(lite::DDim(x_ref_.dims())); + auto x_half_data = x_half_.mutable_data(); + for (int64_t i = 0; i < x_half_.numel(); i++) { + x_half_data[i] = half(lite::float16(x_ref_.data()[i])); + } + x_gpu_.Assign(x_half_data, x_gpu_.dims()); + } - auto* x_cpu_data = x_cpu.mutable_data(); - auto* out_cpu_data = out_cpu.mutable_data(); - auto* x_ref_data = x_ref.mutable_data(); + void RunBaseLine(const lite::Tensor* x, lite::Tensor* out) { + TransBaseLine(x, out, axes_); + } - for (int i = 0; i < x_cpu.numel(); ++i) { - x_cpu_data[i] = i + 1; - x_ref_data[i] = i + 1; + int C_, H_, W_; + std::vector axes_; + std::vector x_shape_, out_shape_; + + lite::Tensor x_ref_, out_ref_; + lite::Tensor x_gpu_, out_gpu_; + lite::Tensor x_half_; + lite::Tensor out_cpu_; + + operators::TransposeParam param_; + std::unique_ptr ctx_; + cudaStream_t stream_; +}; + +TEST_F(TransposeTest, fp32) { + InitFloatInput(); + TransposeCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); } - x.Assign(x_cpu_data, x_cpu.dims()); - param.x = &x; - param.output = &out; - param.axis = axes; - transpose_kernel.SetParam(param); - cudaStream_t stream; - cudaStreamCreate(&stream); - context.SetExecStream(stream); - transpose_kernel.SetContext(std::move(ctx)); - transpose_kernel.Launch(); + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < 
FLAGS_repeats; ++i) { + kernel.Run(); + } cudaDeviceSynchronize(); - auto* out_data = out.mutable_data(TARGET(kCUDA)); - CopySync( - out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); - transpose_ref(&x_ref, &out_ref, axes); - auto* out_ref_data = out_ref.mutable_data(); - for (int i = 0; i < out.numel(); i++) { - EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp32, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + CopySync(out_cpu_.mutable_data(), + out_gpu_.data(), + sizeof(float) * out_gpu_.numel(), + IoDirection::DtoH); + for (int i = 0; i < out_gpu_.numel(); ++i) { + EXPECT_NEAR(out_cpu_.data()[i], out_ref_.data()[i], 1e-5); + } +} + +TEST_F(TransposeTest, TestFP16) { + InitHalfInput(); + TransposeCompute kernel; + kernel.SetParam(param_); + kernel.SetContext(std::move(ctx_)); + + for (int i = 0; i < FLAGS_warmup; ++i) { + kernel.Launch(); + cudaDeviceSynchronize(); + } + + auto start = GetCurrentUS(); + kernel.PrepareForRun(); + for (int i = 0; i < FLAGS_repeats; ++i) { + kernel.Run(); + } + cudaDeviceSynchronize(); + auto duration = (GetCurrentUS() - start) / 1000.0; + LOG(INFO) << "fp16, warmup: " << FLAGS_warmup + << ", repeats: " << FLAGS_repeats << ", spend " + << duration / FLAGS_repeats << " ms in average."; + + const half* out_gpu_data = out_gpu_.data(); + half* out_cpu_data = out_cpu_.mutable_data(); + CopySync(out_cpu_data, + out_gpu_data, + sizeof(half) * out_gpu_.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_cpu_.numel(); ++i) { + float res = static_cast(lite::float16(out_cpu_data[i])); + float ref = out_ref_.data()[i]; + EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2); } } diff --git a/lite/kernels/cuda/yolo_box_compute.cu b/lite/kernels/cuda/yolo_box_compute.cu index 6b4b2875f39c479f3ddd387230dbdf8e3d24ce3c..23f5639a9ddbafa38cc575ac5ca068916956a075 100644 --- a/lite/kernels/cuda/yolo_box_compute.cu +++ b/lite/kernels/cuda/yolo_box_compute.cu @@ -185,15 +185,11 @@ void YoloBoxCompute::Run() { anchors_.Resize({static_cast(anchors.size())}); int* d_anchors = anchors_.mutable_data(TARGET(kCUDA)); - // TargetWrapperCuda::MemcpyAsync(d_anchors, - // anchors.data(), - // sizeof(int) * anchors.size(), - // IoDirection::HtoD, - // stream); - CopySync(d_anchors, - anchors.data(), - sizeof(int) * anchors.size(), - IoDirection::HtoD); + TargetWrapperCuda::MemcpyAsync(d_anchors, + anchors.data(), + sizeof(int) * anchors.size(), + IoDirection::HtoD, + stream); int threads = 512; int blocks = (n * box_num + threads - 1) / threads; diff --git a/lite/kernels/fpga/activation_compute_test.cc b/lite/kernels/fpga/activation_compute_test.cc index cef87afffca65ee82ca63e58191d3877f62824f2..99f702b84b3439814433e7c416151b43772dfb0e 100644 --- a/lite/kernels/fpga/activation_compute_test.cc +++ b/lite/kernels/fpga/activation_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/fpga/activation_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/fpga/activation_compute.h" namespace paddle { namespace lite { @@ -37,8 +39,7 @@ void activation_compute_ref(const operators::ActivationParam& param) { } TEST(activation_fpga, retrive_op) { - auto activation = - KernelRegistry::Global().Create("relu"); + auto activation = KernelRegistry::Global().Create("relu"); ASSERT_FALSE(activation.empty()); ASSERT_TRUE(activation.front()); } diff --git a/lite/kernels/fpga/fc_compute_test.cc b/lite/kernels/fpga/fc_compute_test.cc index 6ef8c02ed06dd89876dcab8c14fe389039bda614..08daecda314c771d0597951162d043f34d6316c9 100644 --- a/lite/kernels/fpga/fc_compute_test.cc +++ b/lite/kernels/fpga/fc_compute_test.cc @@ -12,15 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/fpga/fc_compute.h" #include + #include #include #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/fpga/fc_compute.h" namespace paddle { namespace lite { @@ -76,8 +78,7 @@ void FillData(T* a, } TEST(fc_fpga, retrive_op) { - auto fc = - KernelRegistry::Global().Create("fc"); + auto fc = KernelRegistry::Global().Create("fc"); ASSERT_FALSE(fc.empty()); ASSERT_TRUE(fc.front()); } diff --git a/lite/kernels/fpga/pooling_compute_test.cc b/lite/kernels/fpga/pooling_compute_test.cc old mode 100755 new mode 100644 index 9248289fe9353705e7a2d84831b9f3de5d8ee7d7..ff93f1a6e1c30d006065deb04576255c24baed25 --- a/lite/kernels/fpga/pooling_compute_test.cc +++ b/lite/kernels/fpga/pooling_compute_test.cc @@ -12,14 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/fpga/pooling_compute.h" #include + #include #include #include -#include "lite/core/op_registry.h" #include "lite/backends/fpga/KD/float16.hpp" +#include "lite/core/op_registry.h" +#include "lite/kernels/fpga/pooling_compute.h" namespace paddle { namespace lite { @@ -277,8 +278,7 @@ TEST(pool_fpga, compute) { } TEST(pool_fpga, retrive_op) { - auto pool = KernelRegistry::Global().Create( - "pool2d"); + auto pool = KernelRegistry::Global().Create("pool2d"); ASSERT_FALSE(pool.empty()); ASSERT_TRUE(pool.front()); } diff --git a/lite/kernels/fpga/softmax_compute_test.cc b/lite/kernels/fpga/softmax_compute_test.cc index f92139d0f49b3d149531f11cb422e44ded6e7e64..a6f456ba1f140d07ccfcea0d7746c1061586611e 100644 --- a/lite/kernels/fpga/softmax_compute_test.cc +++ b/lite/kernels/fpga/softmax_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/fpga/softmax_compute.h" #include + #include #include + #include "lite/backends/fpga/KD/float16.hpp" #include "lite/core/op_registry.h" +#include "lite/kernels/fpga/softmax_compute.h" namespace paddle { namespace lite { @@ -121,9 +123,7 @@ TEST(softmax_arm, compute) { } TEST(softmax, retrive_op) { - auto softmax = - KernelRegistry::Global().Create( - "softmax"); + auto softmax = KernelRegistry::Global().Create("softmax"); ASSERT_FALSE(softmax.empty()); ASSERT_TRUE(softmax.front()); } diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt index a70345708cce678b52e288a1f3eaf4ee1a23f541..cd91d2dc90f9f48668e1d5ab9fbe5d065cb0e191 100644 --- a/lite/kernels/host/CMakeLists.txt +++ b/lite/kernels/host/CMakeLists.txt @@ -16,3 +16,10 @@ add_kernel(ctc_align_compute_host Host extra SRCS ctc_align_compute.cc DEPS ${li add_kernel(write_to_array_compute_host Host extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps}) add_kernel(read_from_array_compute_host Host extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps}) add_kernel(assign_compute_host Host extra SRCS assign_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(retinanet_detection_output_compute_host Host extra SRCS retinanet_detection_output_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(where_index_compute_host Host extra SRCS where_index_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(activation_grad_compute_host Host train SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps}) + +if(LITE_BUILD_EXTRA) + lite_cc_test(test_where_index_compute_host SRCS where_index_compute.cc DEPS where_index_compute_host) +endif() diff --git a/lite/kernels/host/activation_grad_compute.cc b/lite/kernels/host/activation_grad_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..4b837cfda4572fa106a1ba1d015ffd5163b08340 --- /dev/null +++ b/lite/kernels/host/activation_grad_compute.cc @@ -0,0 +1,98 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/host/activation_grad_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +void SquareGradCompute::Run() { + auto& param = this->Param(); + CHECK(param.X); + auto out_grad_dims = param.Out_grad->dims(); + auto out_grad_data = param.Out_grad->data(); + + auto x_data = param.X->data(); + auto x_grad_data = param.X_grad->mutable_data(); + for (int i = 0; i < out_grad_dims.production(); i++) { + x_grad_data[i] = out_grad_data[i] * 2.0 * x_data[i]; + } +} + +void ReluGradCompute::Run() { + auto& param = this->Param(); + CHECK(param.X); + auto out_grad_dims = param.Out_grad->dims(); + auto out_grad_data = param.Out_grad->data(); + + auto x_data = param.X->data(); + auto x_grad_data = param.X_grad->mutable_data(); + for (int i = 0; i < out_grad_dims.production(); i++) { + x_grad_data[i] = x_data[i] > 0 ? 
out_grad_data[i] : 0.0;
+  }
+}
+
+void TanhGradCompute::Run() {
+  auto& param = this->Param<param_t>();
+  CHECK(param.Out);
+  auto out_grad_dims = param.Out_grad->dims();
+  auto out_grad_data = param.Out_grad->data<float>();
+
+  auto out_data = param.Out->data<float>();
+  auto x_grad_data = param.X_grad->mutable_data<float>();
+  for (int i = 0; i < out_grad_dims.production(); i++) {
+    x_grad_data[i] = out_grad_data[i] *
+                     (static_cast<float>(1.0) - out_data[i] * out_data[i]);
+  }
+}
+
+} // namespace host
+} // namespace kernels
+} // namespace lite
+} // namespace paddle
+
+REGISTER_LITE_KERNEL(square_grad,
+                     kHost,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::host::SquareGradCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kHost))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(relu_grad,
+                     kHost,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::host::ReluGradCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kHost))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(tanh_grad,
+                     kHost,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::host::TanhGradCompute,
+                     def)
+    .BindInput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindInput("Out@GRAD", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("X@GRAD", {LiteType::GetTensorTy(TARGET(kHost))})
+    .Finalize();
diff --git a/lite/kernels/host/activation_grad_compute.h b/lite/kernels/host/activation_grad_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..d942b901c448ee87410a2030ea0f9f10aca0e493
--- /dev/null
+++ b/lite/kernels/host/activation_grad_compute.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
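+
+// Element-wise backward kernels for activations. Each kernel reads Out@GRAD
+// (dout) plus the forward X or Out and writes X@GRAD (dx):
+//   square_grad: dx = 2 * x * dout
+//   relu_grad:   dx = dout if x > 0, else 0
+//   tanh_grad:   dx = dout * (1 - out * out)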
+ +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class SquareGradCompute : public KernelLite { + public: + using param_t = operators::ActivationGradParam; + + void Run() override; + + virtual ~SquareGradCompute() = default; +}; + +class ReluGradCompute : public KernelLite { + public: + using param_t = operators::ActivationGradParam; + + void Run() override; + + virtual ~ReluGradCompute() = default; +}; + +class TanhGradCompute : public KernelLite { + public: + using param_t = operators::ActivationGradParam; + + void Run() override; + + virtual ~TanhGradCompute() = default; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/host/retinanet_detection_output_compute.cc b/lite/kernels/host/retinanet_detection_output_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..95a4bf708e7f03aee9d9ac99323b173287260b13 --- /dev/null +++ b/lite/kernels/host/retinanet_detection_output_compute.cc @@ -0,0 +1,435 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/host/retinanet_detection_output_compute.h" +#include +#include +#include +#include +#include "lite/operators/retinanet_detection_output_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +template +bool SortScoreTwoPairDescend(const std::pair>& pair1, + const std::pair>& pair2) { + return pair1.first > pair2.first; +} + +template +static inline void GetMaxScoreIndex( + const std::vector& scores, + const T threshold, + int top_k, + std::vector>* sorted_indices) { + for (size_t i = 0; i < scores.size(); ++i) { + if (scores[i] > threshold) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), + sorted_indices->end(), + SortScorePairDescend); + // Keep top_k scores if needed. + if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { + sorted_indices->resize(top_k); + } +} + +template +static inline T BBoxArea(const std::vector& box, const bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. 
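+      // In pixel coordinates the box bounds are inclusive, so width and
+      // height are each (max - min + 1).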
+ return (w + 1) * (h + 1); + } + } +} + +template +static inline T JaccardOverlap(const std::vector& box1, + const std::vector& box2, + const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + T norm = normalized ? static_cast(0.) : static_cast(1.); + T inter_w = inter_xmax - inter_xmin + norm; + T inter_h = inter_ymax - inter_ymin + norm; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +void NMSFast(const std::vector>& cls_dets, + const T nms_threshold, + const T eta, + std::vector* selected_indices) { + int64_t num_boxes = cls_dets.size(); + std::vector> sorted_indices; + for (int64_t i = 0; i < num_boxes; ++i) { + sorted_indices.push_back(std::make_pair(cls_dets[i][4], i)); + } + // Sort the score pair according to the scores in descending order + std::stable_sort( + sorted_indices.begin(), sorted_indices.end(), SortScorePairDescend); + selected_indices->clear(); + T adaptive_threshold = nms_threshold; + + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (size_t k = 0; k < selected_indices->size(); ++k) { + if (keep) { + const int kept_idx = (*selected_indices)[k]; + T overlap = T(0.); + + overlap = JaccardOverlap(cls_dets[idx], cls_dets[kept_idx], false); + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + selected_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } +} + +template +void DeltaScoreToPrediction( + const std::vector& bboxes_data, + const std::vector& anchors_data, + T im_height, + T im_width, + T im_scale, + int class_num, + const std::vector>& sorted_indices, + std::map>>* preds) { + im_height = static_cast(std::round(im_height / im_scale)); + im_width = static_cast(std::round(im_width / im_scale)); + T zero(0); + int i = 0; + for (const auto& it : sorted_indices) { + T score = it.first; + int idx = it.second; + int a = idx / class_num; + int c = idx % class_num; + + int box_offset = a * 4; + T anchor_box_width = + anchors_data[box_offset + 2] - anchors_data[box_offset] + 1; + T anchor_box_height = + anchors_data[box_offset + 3] - anchors_data[box_offset + 1] + 1; + T anchor_box_center_x = anchors_data[box_offset] + anchor_box_width / 2; + T anchor_box_center_y = + anchors_data[box_offset + 1] + anchor_box_height / 2; + T target_box_center_x = 0, target_box_center_y = 0; + T target_box_width = 0, target_box_height = 0; + target_box_center_x = + bboxes_data[box_offset] * anchor_box_width + anchor_box_center_x; + target_box_center_y = + bboxes_data[box_offset + 1] * anchor_box_height + anchor_box_center_y; + target_box_width = std::exp(bboxes_data[box_offset + 2]) * anchor_box_width; + target_box_height = + std::exp(bboxes_data[box_offset + 3]) * anchor_box_height; + T pred_box_xmin = target_box_center_x - target_box_width / 2; + T pred_box_ymin = target_box_center_y - target_box_height / 2; + T pred_box_xmax = target_box_center_x + target_box_width / 2 - 1; + T pred_box_ymax = 
target_box_center_y + target_box_height / 2 - 1; + pred_box_xmin = pred_box_xmin / im_scale; + pred_box_ymin = pred_box_ymin / im_scale; + pred_box_xmax = pred_box_xmax / im_scale; + pred_box_ymax = pred_box_ymax / im_scale; + + pred_box_xmin = std::max(std::min(pred_box_xmin, im_width - 1), zero); + pred_box_ymin = std::max(std::min(pred_box_ymin, im_height - 1), zero); + pred_box_xmax = std::max(std::min(pred_box_xmax, im_width - 1), zero); + pred_box_ymax = std::max(std::min(pred_box_ymax, im_height - 1), zero); + + std::vector one_pred; + one_pred.push_back(pred_box_xmin); + one_pred.push_back(pred_box_ymin); + one_pred.push_back(pred_box_xmax); + one_pred.push_back(pred_box_ymax); + one_pred.push_back(score); + (*preds)[c].push_back(one_pred); + i++; + } +} + +template +void MultiClassNMS(const std::map>>& preds, + int class_num, + const int keep_top_k, + const T nms_threshold, + const T nms_eta, + std::vector>* nmsed_out, + int* num_nmsed_out) { + std::map> indices; + int num_det = 0; + for (int c = 0; c < class_num; ++c) { + if (static_cast(preds.count(c))) { + const std::vector> cls_dets = preds.at(c); + NMSFast(cls_dets, nms_threshold, nms_eta, &(indices[c])); + num_det += indices[c].size(); + } + } + + std::vector>> score_index_pairs; + for (const auto& it : indices) { + int label = it.first; + const std::vector& label_indices = it.second; + for (size_t j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + score_index_pairs.push_back( + std::make_pair(preds.at(label)[idx][4], std::make_pair(label, idx))); + } + } + // Keep top k results per image. + std::stable_sort(score_index_pairs.begin(), + score_index_pairs.end(), + SortScoreTwoPairDescend); + if (num_det > keep_top_k) { + score_index_pairs.resize(keep_top_k); + } + + // Store the new indices. + std::map> new_indices; + for (const auto& it : score_index_pairs) { + int label = it.second.first; + int idx = it.second.second; + std::vector one_pred; + one_pred.push_back(label); + one_pred.push_back(preds.at(label)[idx][4]); + one_pred.push_back(preds.at(label)[idx][0]); + one_pred.push_back(preds.at(label)[idx][1]); + one_pred.push_back(preds.at(label)[idx][2]); + one_pred.push_back(preds.at(label)[idx][3]); + nmsed_out->push_back(one_pred); + } + + *num_nmsed_out = (num_det > keep_top_k ? 
keep_top_k : num_det); +} + +template +void RetinanetDetectionOutput( + const operators::RetinanetDetectionOutputParam& param, + const std::vector& scores, + const std::vector& bboxes, + const std::vector& anchors, + const Tensor& im_info, + std::vector>* nmsed_out, + int* num_nmsed_out) { + int64_t nms_top_k = param.nms_top_k; + int64_t keep_top_k = param.keep_top_k; + T nms_threshold = static_cast(param.nms_threshold); + T nms_eta = static_cast(param.nms_eta); + T score_threshold = static_cast(param.score_threshold); + + int64_t class_num = scores[0].dims()[1]; + std::map>> preds; + for (size_t l = 0; l < scores.size(); ++l) { + // Fetch per level score + Tensor scores_per_level = scores[l]; + // Fetch per level bbox + Tensor bboxes_per_level = bboxes[l]; + // Fetch per level anchor + Tensor anchors_per_level = anchors[l]; + + int64_t scores_num = scores_per_level.numel(); + int64_t bboxes_num = bboxes_per_level.numel(); + std::vector scores_data(scores_num); + std::vector bboxes_data(bboxes_num); + std::vector anchors_data(bboxes_num); + std::copy_n(scores_per_level.data(), scores_num, scores_data.begin()); + std::copy_n(bboxes_per_level.data(), bboxes_num, bboxes_data.begin()); + std::copy_n(anchors_per_level.data(), bboxes_num, anchors_data.begin()); + std::vector> sorted_indices; + + // For the highest level, we take the threshold 0.0 + T threshold = (l < (scores.size() - 1) ? score_threshold : 0.0); + GetMaxScoreIndex(scores_data, threshold, nms_top_k, &sorted_indices); + auto* im_info_data = im_info.data(); + auto im_height = im_info_data[0]; + auto im_width = im_info_data[1]; + auto im_scale = im_info_data[2]; + DeltaScoreToPrediction(bboxes_data, + anchors_data, + im_height, + im_width, + im_scale, + class_num, + sorted_indices, + &preds); + } + + MultiClassNMS(preds, + class_num, + keep_top_k, + nms_threshold, + nms_eta, + nmsed_out, + num_nmsed_out); +} + +template +void MultiClassOutput(const std::vector>& nmsed_out, + Tensor* outs) { + auto* odata = outs->mutable_data(); + int count = 0; + int64_t out_dim = 6; + for (size_t i = 0; i < nmsed_out.size(); ++i) { + odata[count * out_dim] = nmsed_out[i][0] + 1; // label + odata[count * out_dim + 1] = nmsed_out[i][1]; // score + odata[count * out_dim + 2] = nmsed_out[i][2]; // xmin + odata[count * out_dim + 3] = nmsed_out[i][3]; // xmin + odata[count * out_dim + 4] = nmsed_out[i][4]; // xmin + odata[count * out_dim + 5] = nmsed_out[i][5]; // xmin + count++; + } +} + +void RetinanetDetectionOutputCompute::Run() { + auto& param = Param(); + auto& boxes = param.bboxes; + auto& scores = param.scores; + auto& anchors = param.anchors; + auto* im_info = param.im_info; + auto* outs = param.out; + + std::vector boxes_list(boxes.size()); + std::vector scores_list(scores.size()); + std::vector anchors_list(anchors.size()); + for (size_t j = 0; j < boxes_list.size(); ++j) { + boxes_list[j] = *boxes[j]; + scores_list[j] = *scores[j]; + anchors_list[j] = *anchors[j]; + } + auto score_dims = scores_list[0].dims(); + int64_t batch_size = score_dims[0]; + auto box_dims = boxes_list[0].dims(); + int64_t box_dim = box_dims[2]; + int64_t out_dim = box_dim + 2; + + std::vector>> all_nmsed_out; + std::vector batch_starts = {0}; + for (int i = 0; i < batch_size; ++i) { + int num_nmsed_out = 0; + std::vector box_per_batch_list(boxes_list.size()); + std::vector score_per_batch_list(scores_list.size()); + for (size_t j = 0; j < boxes_list.size(); ++j) { + auto score_dims = scores_list[j].dims(); + score_per_batch_list[j] = scores_list[j].Slice(i, i + 1); + 
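// For image i, slice each level's scores/boxes and drop the batch dim: scores become [num_priors, class_num], boxes [num_priors, box_dim]. +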
score_per_batch_list[j].Resize({score_dims[1], score_dims[2]}); + box_per_batch_list[j] = boxes_list[j].Slice(i, i + 1); + box_per_batch_list[j].Resize({score_dims[1], box_dim}); + } + Tensor im_info_slice = im_info->Slice(i, i + 1); + + std::vector> nmsed_out; + RetinanetDetectionOutput(param, + score_per_batch_list, + box_per_batch_list, + anchors_list, + im_info_slice, + &nmsed_out, + &num_nmsed_out); + all_nmsed_out.push_back(nmsed_out); + batch_starts.push_back(batch_starts.back() + num_nmsed_out); + } + + uint64_t num_kept = batch_starts.back(); + if (num_kept == 0) { + outs->Resize({0, out_dim}); + } else { + outs->Resize({static_cast(num_kept), out_dim}); + for (int i = 0; i < batch_size; ++i) { + int64_t s = static_cast(batch_starts[i]); + int64_t e = static_cast(batch_starts[i + 1]); + if (e > s) { + Tensor out = outs->Slice(s, e); + MultiClassOutput(all_nmsed_out[i], &out); + } + } + } + + LoD lod; + lod.emplace_back(batch_starts); + outs->set_lod(lod); +} + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + retinanet_detection_output, + kHost, + kFloat, + kNCHW, + paddle::lite::kernels::host::RetinanetDetectionOutputCompute, + def) + .BindInput("BBoxes", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Scores", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Anchors", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("ImInfo", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/host/retinanet_detection_output_compute.h b/lite/kernels/host/retinanet_detection_output_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..612ea7105e2728b856f02d71e9fcfaea2a1ef680 --- /dev/null +++ b/lite/kernels/host/retinanet_detection_output_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class RetinanetDetectionOutputCompute + : public KernelLite { + public: + void Run() override; + + virtual ~RetinanetDetectionOutputCompute() = default; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/host/where_index_compute.cc b/lite/kernels/host/where_index_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..d06be8d332734f3e41b0414e891c8810a117d8a6 --- /dev/null +++ b/lite/kernels/host/where_index_compute.cc @@ -0,0 +1,173 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/host/where_index_compute.h" +#include +#include +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +static void where_index_rank4(const int64_t* true_index, + int true_num, + const int64_t* stride, + int64_t* out) { + int cnt = true_num >> 1; + register int64_t stride0 = stride[0]; + register int64_t stride1 = stride[1]; + register int64_t stride2 = stride[2]; + register int64_t stride3 = stride[3]; + for (int i = 0; i < cnt; ++i) { + int64_t index0 = true_index[i * 2]; + int64_t index1 = true_index[i * 2 + 1]; + int out_index = i * 8; + // rank0 + register int64_t oindex0 = index0 / stride0; + register int64_t oindex1 = index1 / stride0; + out[out_index] = oindex0; + index0 -= oindex0 * stride0; + index1 -= oindex1 * stride0; + out[out_index + 4] = oindex1; + out_index++; + // rank1 + oindex0 = index0 / stride1; + oindex1 = index1 / stride1; + out[out_index] = oindex0; + index0 -= oindex0 * stride1; + index1 -= oindex1 * stride1; + out[out_index + 4] = oindex1; + out_index++; + // rank2 + oindex0 = index0 / stride2; + oindex1 = index1 / stride2; + out[out_index] = oindex0; + index0 -= oindex0 * stride2; + index1 -= oindex1 * stride2; + out[out_index + 4] = oindex1; + out_index++; + // rank3 + oindex0 = index0 / stride3; + oindex1 = index1 / stride3; + out[out_index] = oindex0; + out[out_index + 4] = oindex1; + } + // remain + for (int r = cnt * 2; r < true_num; ++r) { + int out_index = r * 4; + int64_t index = true_index[r]; + for (int i = 0; i < 4; ++i) { + out[out_index + i] = index / stride[i]; + index -= out[out_index + i] * stride[i]; + } + } +} + +inline void where_index_rank1(const int64_t* true_index, + int true_num, + int64_t* out) { + memcpy(out, true_index, true_num * sizeof(int64_t)); +} + +static void where_index_rankn(const int64_t* true_index, + int true_num, + const int64_t* stride, + int rank, + int64_t* out) { + int out_index = 0; + for (int i = 0; i < true_num; ++i) { + int64_t index = true_index[i]; + for (int r = 0; r < rank; ++r) { + out[out_index] = index / stride[r]; + index -= out[out_index++] * stride[r]; + } + } +} + +template +void WhereIndexKernel(const operators::WhereIndexParam& param) { + auto* input = param.input; + auto* output = param.output; + auto dims = input->dims(); + auto numel = dims.production(); + int64_t rank = static_cast(dims.size()); + const T* cond_data = input->template data(); + int64_t true_num = 0; + std::vector true_index(numel); + for (auto i = 0; i < numel; i++) { + if (static_cast(cond_data[i])) { + true_index[true_num] = i; + true_num++; + } + } + output->Resize({true_num, rank}); + if (true_num == 0) { + return; + } + auto* out_ptr = output->template mutable_data(); + std::vector stride(rank); + stride[rank - 1] = 1; + for (int i = rank - 2; i >= 0; i--) { + stride[i] = stride[i + 1] * dims[i + 1]; + } + if (rank == 1) { + 
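// Rank-1 input: the flat indices are already the output coordinates, so they are copied through unchanged (see where_index_rank1 above). +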
where_index_rank1(true_index.data(), true_num, out_ptr); + } else if (rank == 4) { + where_index_rank4(true_index.data(), true_num, stride.data(), out_ptr); + } else { + where_index_rankn( + true_index.data(), true_num, stride.data(), rank, out_ptr); + } +} + +void WhereIndexCompute::Run() { + auto& param = this->Param(); + switch (param.input->precision()) { + case PRECISION(kFloat): + WhereIndexKernel(param); + break; + case PRECISION(kInt32): + WhereIndexKernel(param); + break; + case PRECISION(kInt64): + WhereIndexKernel(param); + break; + case PRECISION(kInt8): + WhereIndexKernel(param); + break; + case PRECISION(kBool): + WhereIndexKernel(param); + break; + default: + LOG(FATAL) << "WhereIndex does not implement for the " + << "input type:" << static_cast(param.input->precision()); + } +} + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle + +using whereindex = paddle::lite::kernels::host::WhereIndexCompute; + +REGISTER_LITE_KERNEL(where_index, kHost, kAny, kAny, whereindex, def) + .BindInput("Condition", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/host/where_index_compute.h b/lite/kernels/host/where_index_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..6936e3ed8f0ee16bf0e41095bbcbd0c18169d62f --- /dev/null +++ b/lite/kernels/host/where_index_compute.h @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/kernel.h" +#include "lite/operators/where_index_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace host { + +class WhereIndexCompute : public KernelLite { + public: + using param_t = operators::WhereIndexParam; + + void Run() override; + + virtual ~WhereIndexCompute() = default; +}; + +} // namespace host +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/host/where_index_compute_test.cc b/lite/kernels/host/where_index_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..7097bdcae2bb319331af72c390a9d5de4fc23a9f --- /dev/null +++ b/lite/kernels/host/where_index_compute_test.cc @@ -0,0 +1,174 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "lite/kernels/host/where_index_compute.h"
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <random>
+#include <vector>
+#include "lite/core/context.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace host {
+
+template <typename T>
+void where_index_compute_ref(lite::Tensor* condition, lite::Tensor* out) {
+  auto dims = condition->dims();
+  auto numel = condition->numel();
+  const int64_t rank = static_cast<int64_t>(dims.size());
+  const T* cond_data = condition->data<T>();
+  std::vector<int64_t> true_index;
+  for (int64_t i = 0; i < numel; i++) {
+    if (static_cast<bool>(cond_data[i])) {
+      true_index.push_back(i);
+    }
+  }
+  int64_t true_num = static_cast<int64_t>(true_index.size());
+  out->Resize({true_num, rank});
+  int64_t* out_ptr = out->mutable_data<int64_t>();
+  if (true_num == 0) {
+    return;
+  }
+
+  // Row-major strides, used to unravel a flat offset into per-dim indices.
+  std::vector<int64_t> stride(rank);
+  stride[rank - 1] = 1;
+  for (int i = rank - 2; i >= 0; i--) {
+    stride[i] = stride[i + 1] * dims[i + 1];
+  }
+  for (int64_t i = 0; i < true_num; ++i) {
+    int64_t index = true_index[i];
+    for (int64_t j = 0; j < rank; ++j) {
+      out_ptr[i * rank + j] = index / stride[j];
+      index -= out_ptr[i * rank + j] * stride[j];
+    }
+  }
+}
+
+TEST(where_index, init) {
+  WhereIndexCompute where_index;
+  ASSERT_EQ(where_index.precision(), PRECISION(kAny));
+  ASSERT_EQ(where_index.target(), TARGET(kHost));
+}
+
+TEST(where_index, retrive_op) {
+  auto where_index =
+      KernelRegistry::Global()
+          .Create<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)>(
+              "where_index");
+  ASSERT_FALSE(where_index.empty());
+  ASSERT_TRUE(where_index.front());
+}
+
+TEST(where_index, compute) {
+  paddle::lite::DeviceInfo::Init();
+  WhereIndexCompute where_index;
+  operators::WhereIndexParam param;
+
+  lite::Tensor input;
+  lite::Tensor output;
+  lite::Tensor output_ref;
+  param.input = &input;
+  param.output = &output;
+  where_index.SetParam(param);
+  for (auto& n : {1, 2, 4}) {
+    for (auto& c : {1, 3, 21, 32}) {
+      for (auto& h : {1, 5, 63}) {
+        for (auto& w : {1, 5, 64}) {
+          for (auto& dim_size : {1, 2, 3, 4}) {
+            // Cycle through int32, int64, int8, bool and float inputs.
+            for (int i = 0; i < 5; ++i) {
+              std::vector<int64_t> in_shape;
+              in_shape.push_back(n);
+              in_shape.push_back(c);
+              in_shape.push_back(h);
+              in_shape.push_back(w);
+              // Fold the trailing dimensions into the last kept axis so the
+              // tensor has dim_size dimensions but the same element count.
+              int64_t outer = 1;
+              for (size_t j = dim_size - 1; j < in_shape.size(); ++j) {
+                outer *= in_shape[j];
+              }
+              in_shape.resize(dim_size);
+              in_shape[dim_size - 1] = outer;
+
+              DDim indim(in_shape);
+              LOG(INFO) << "in dims: ";
+              for (int j = 0; j < dim_size; ++j) {
+                LOG(INFO) << in_shape[j];
+              }
+              input.Resize(indim);
+              std::default_random_engine engine;
+              std::uniform_real_distribution<float> dist(-1, 1);
+              if (i == 0) {
+                int* indata = input.mutable_data<int>();
+                for (int64_t j = 0; j < indim.production(); ++j) {
+                  indata[j] = static_cast<int>(dist(engine) > 0);
+                }
+                where_index_compute_ref<int>(&input, &output_ref);
+              } else if (i == 1) {
+                int64_t* indata = input.mutable_data<int64_t>();
+                for (int64_t j = 0; j < indim.production(); ++j) {
+                  indata[j] = static_cast<int64_t>(dist(engine) > 0);
+                }
+                where_index_compute_ref<int64_t>(&input, &output_ref);
+              } else if (i == 2) {
+                int8_t* indata = input.mutable_data<int8_t>();
+                for (int64_t j = 0; j < indim.production(); ++j) {
+                  indata[j] = static_cast<int8_t>(dist(engine) > 0);
+                }
+                where_index_compute_ref<int8_t>(&input, &output_ref);
+              } else if (i == 3) {
+                bool* indata = input.mutable_data<bool>();
+                for (int64_t j = 0; j < indim.production(); ++j) {
+                  indata[j] = dist(engine) > 0;
+                }
+                where_index_compute_ref<bool>(&input, &output_ref);
+              } else {
+                float* indata = input.mutable_data<float>();
+                for (int64_t j = 0; j < indim.production(); ++j) {
+                  indata[j] = dist(engine) > 0;
+                }
+                where_index_compute_ref<float>(&input, &output_ref);
+              }
+              where_index.Run();
+              const int64_t* outdata = output.data<int64_t>();
+              const int64_t* outdata_ref = output_ref.data<int64_t>();
+              CHECK_EQ(output.dims(), output_ref.dims())
+                  << "where_index out shape error! out_dim is not equal "
+                     "to out_ref dim";
+              for (int i = 0; i < output.numel(); i++) {
+                if (std::abs(outdata[i] - outdata_ref[i]) > 0) {
+                  LOG(FATAL) << "where_index cmp error, i: " << i
+                             << ", output_data: " << outdata[i]
+                             << ", output_ref_data: " << outdata_ref[i]
+                             << ", input precision: "
+                             << static_cast<int>(input.precision());
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace host
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_KERNEL(where_index, kHost, kAny, kAny, def);
diff --git a/lite/kernels/mlu/CMakeLists.txt b/lite/kernels/mlu/CMakeLists.txt
index f9395d45ccecccaf3f873797d0c2d71eda266319..634a0afc551d83be58487d7393e092196e0f6cc5 100644
--- a/lite/kernels/mlu/CMakeLists.txt
+++ b/lite/kernels/mlu/CMakeLists.txt
@@ -4,6 +4,7 @@ endif()
 add_subdirectory(bridges)
 add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} ${mlu_subgraph_bridges})
-add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
-add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
-add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_mlu})
+add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${target_wrapper_mlu})
+add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps})
+# depend on transpose function in backend/x86/math/math_function
+add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_function})
diff --git a/lite/kernels/mlu/bridges/CMakeLists.txt b/lite/kernels/mlu/bridges/CMakeLists.txt
index 82510ab9b6a794f5c6b1ffb43d2d3c55db3a5514..91323925e1ef49462c180fd96392d638e273fd69 100644
--- a/lite/kernels/mlu/bridges/CMakeLists.txt
+++ b/lite/kernels/mlu/bridges/CMakeLists.txt
@@ -3,7 +3,7 @@ if(NOT LITE_WITH_MLU)
 endif()
 lite_cc_library(subgraph_bridge_utility_mlu SRCS utility.cc DEPS ${mlu_builder_libs} tensor)
-lite_cc_library(subgraph_bridge_tensor_mlu SRCS tensor.cc DEPS ${mlu_builder_libs})
+lite_cc_library(subgraph_bridge_tensor_mlu SRCS tensor.cc DEPS ${mlu_builder_libs} subgraph_bridge_utility_mlu)
 lite_cc_library(subgraph_bridge_graph_mlu SRCS graph.cc DEPS subgraph_bridge_utility_mlu subgraph_bridge_tensor_mlu)
 set(mlu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_engine subgraph_bridge_utility_mlu subgraph_bridge_graph_mlu)
@@ -18,6 +18,16 @@ lite_cc_library(subgraph_bridge_fc_op_mlu SRCS fc_op.cc DEPS ${subgraph_bridge_d
 lite_cc_library(subgraph_bridge_scale_op_mlu SRCS scale_op.cc DEPS ${subgraph_bridge_deps_mlu})
 lite_cc_library(subgraph_bridge_interp_op_mlu SRCS interpolate_op.cc DEPS ${subgraph_bridge_deps_mlu})
 lite_cc_library(subgraph_bridge_concat_op_mlu SRCS concat_op.cc DEPS ${subgraph_bridge_deps_mlu})
+lite_cc_library(subgraph_bridge_transpose_op_mlu SRCS transpose_op.cc DEPS ${subgraph_bridge_deps_mlu})
+lite_cc_library(subgraph_bridge_dropout_op_mlu SRCS dropout_op.cc DEPS ${subgraph_bridge_deps_mlu})
+lite_cc_library(subgraph_bridge_slice_op_mlu SRCS slice_op.cc DEPS ${subgraph_bridge_deps_mlu})
+lite_cc_library(subgraph_bridge_split_op_mlu SRCS split_op.cc DEPS ${subgraph_bridge_deps_mlu})
+lite_cc_library(subgraph_bridge_cast_op_mlu SRCS cast_op.cc DEPS
${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_layout_op_mlu SRCS layout_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_argmax_op_mlu SRCS argmax_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_squeeze_op_mlu SRCS squeeze_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_reshape_op_mlu SRCS reshape_op.cc DEPS ${subgraph_bridge_deps_mlu}) +lite_cc_library(subgraph_bridge_flatten_op_mlu SRCS flatten_op.cc DEPS ${subgraph_bridge_deps_mlu}) set(mlu_subgraph_bridges subgraph_bridge_registry subgraph_bridge_utility_mlu @@ -28,12 +38,35 @@ set(mlu_subgraph_bridges subgraph_bridge_pool_op_mlu subgraph_bridge_softmax_op_mlu subgraph_bridge_fc_op_mlu + subgraph_bridge_transpose_op_mlu subgraph_bridge_batch_norm_op_mlu subgraph_bridge_scale_op_mlu subgraph_bridge_interp_op_mlu subgraph_bridge_concat_op_mlu + subgraph_bridge_dropout_op_mlu + subgraph_bridge_slice_op_mlu + subgraph_bridge_split_op_mlu + subgraph_bridge_cast_op_mlu + subgraph_bridge_layout_op_mlu + subgraph_bridge_argmax_op_mlu + subgraph_bridge_squeeze_op_mlu + subgraph_bridge_reshape_op_mlu + subgraph_bridge_flatten_op_mlu CACHE INTERNAL "mlu_subgraph_bridges") + +if (LITE_BUILD_EXTRA) + lite_cc_library(subgraph_bridge_lrn_op_mlu SRCS lrn_op.cc DEPS ${subgraph_bridge_deps_mlu}) + lite_cc_library(subgraph_bridge_gather_op_mlu SRCS gather_op.cc DEPS ${subgraph_bridge_deps_mlu}) + lite_cc_library(subgraph_bridge_norm_op_mlu SRCS norm_op.cc DEPS ${subgraph_bridge_deps_mlu}) + set(mlu_subgraph_bridges + "${mlu_subgraph_bridges}" + subgraph_bridge_lrn_op_mlu + subgraph_bridge_gather_op_mlu + subgraph_bridge_norm_op_mlu + CACHE INTERNAL "mlu_subgraph_bridges") +endif() + lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges}) lite_cc_test(test_conv_converter_mlu SRCS conv_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_act_converter_mlu SRCS act_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) @@ -45,4 +78,21 @@ lite_cc_test(test_fc_converter_mlu SRCS fc_op_test.cc DEPS scope optimizer targe lite_cc_test(test_scale_converter_mlu SRCS scale_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_interp_converter_mlu SRCS interpolate_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) lite_cc_test(test_concat_converter_mlu SRCS concat_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_transpose_converter_mlu SRCS transpose_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_dropout_converter_mlu SRCS dropout_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_slice_converter_mlu SRCS slice_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_split_converter_mlu SRCS split_op_test.cc DEPS 
scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_layout_converter_mlu SRCS layout_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_cast_converter_mlu SRCS cast_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_argmax_converter_mlu SRCS argmax_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_squeeze_converter_mlu SRCS squeeze_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_reshape_converter_mlu SRCS reshape_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +lite_cc_test(test_flatten_converter_mlu SRCS flatten_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) + +if (LITE_BUILD_EXTRA) + lite_cc_test(test_norm_converter_mlu SRCS norm_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) + lite_cc_test(test_lrn_converter_mlu SRCS lrn_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) + lite_cc_test(test_gather_converter_mlu SRCS gather_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu) +endif() + message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}") diff --git a/lite/kernels/mlu/bridges/act_op.cc b/lite/kernels/mlu/bridges/act_op.cc index 286195d9d5f961288dd0156db31ff8aacae58227..d24c7fac216ed0ba213a4fd95365132a693281c3 100644 --- a/lite/kernels/mlu/bridges/act_op.cc +++ b/lite/kernels/mlu/bridges/act_op.cc @@ -60,6 +60,7 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { output_tensor->mlu_tensor())); } graph->FuseOp(activation_op); + CNML_CALL(cnmlDestroyBaseOp(&activation_op)); return SUCCESS; } @@ -72,6 +73,9 @@ REGISTER_SUBGRAPH_BRIDGE(sigmoid, kMLU, paddle::lite::subgraph::mlu::ActConverter); REGISTER_SUBGRAPH_BRIDGE(relu, kMLU, paddle::lite::subgraph::mlu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(relu6, + kMLU, + paddle::lite::subgraph::mlu::ActConverter); REGISTER_SUBGRAPH_BRIDGE(tanh, kMLU, paddle::lite::subgraph::mlu::ActConverter); REGISTER_SUBGRAPH_BRIDGE(leaky_relu, kMLU, diff --git a/lite/kernels/mlu/bridges/act_op_test.cc b/lite/kernels/mlu/bridges/act_op_test.cc index 2b7747f4d8b647b8cb621876907f6178ebf9fe88..11c0c3f732c4c29fff3aedc6cfdcf55760128b5d 100644 --- a/lite/kernels/mlu/bridges/act_op_test.cc +++ b/lite/kernels/mlu/bridges/act_op_test.cc @@ -13,7 +13,9 @@ // limitations under the License. 
#include + #include + #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" #include "lite/kernels/mlu/bridges/test_helper.h" @@ -116,7 +118,7 @@ void test_act(std::vector x_shape, std::string op_type) { opdesc.SetAttr("offset", 0.5f); } - // create and convert op to NPU model, then run it on NPU + // create and convert op to MLU model, then run it on MLU auto op = CreateOp(opdesc, &scope); // execute reference implementation and save to output tensor act_ref(op); @@ -134,7 +136,8 @@ void test_act(std::vector x_shape, std::string op_type) { TEST(MLUBridges, activation) { std::vector> shapes{{1}, {2, 3}, {1, 2, 3, 4}}; - std::vector types{"sigmoid", "relu", "tanh", "leaky_relu"}; + std::vector types{ + "sigmoid", "relu", "relu6", "tanh", "leaky_relu"}; for (auto x_shape : shapes) { for (auto op_type : types) { test_act(x_shape, op_type); @@ -149,5 +152,6 @@ TEST(MLUBridges, activation) { USE_SUBGRAPH_BRIDGE(sigmoid, kMLU) USE_SUBGRAPH_BRIDGE(relu, kMLU) +USE_SUBGRAPH_BRIDGE(relu6, kMLU) USE_SUBGRAPH_BRIDGE(tanh, kMLU) USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU) diff --git a/lite/kernels/mlu/bridges/argmax_op.cc b/lite/kernels/mlu/bridges/argmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b004639f07c79e5cc414e2d60bc1f32ec522f0f5 --- /dev/null +++ b/lite/kernels/mlu/bridges/argmax_op.cc @@ -0,0 +1,107 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
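+//
+// MLU bridge for the arg_max op: the converter keeps the output shape equal
+// to the input shape with the reduced axis set to 1, casts the normalized
+// axis index to cnmlDimension_t, and, when the graph runs in FP16 with an
+// axis other than C, first casts the input back to FP32 so that
+// cnmlCreateArgmaxOp produces an INT32 index tensor.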
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int ArgmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Get input vars and op attributes + auto x_var_name = op_info->Input("X").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto x_dims = x->dims().Vectorize(); + + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + int axis = op_info->GetAttr("axis"); + if (axis < 0) { + axis = axis + x_dims.size(); + } + cnmlDimension_t argmax_mode = static_cast(axis); + auto mlu_output_dim = x->dims().Vectorize(); + // shape is NCHW, layout is NHWC + mlu_output_dim[axis] = 1; + auto input_tensor = graph->GetNode(x_var_name); + // if use_fp16 and axis is not c, cast input datatype from fp16 to fp32, so + // output datatype is int32 + bool cast_to_fp32 = + graph->FPType() == CNML_DATA_FLOAT16 && argmax_mode != CNML_DIM_C; + cnmlBaseOp_t cast_op{nullptr}; + std::shared_ptr fp32_input_tensor; + if (cast_to_fp32) { + fp32_input_tensor = graph->AddNode(x_var_name + ".fp32", + x_dims, + CNML_TENSOR, + CNML_NCHW, + CNML_DATA_FLOAT32); + cnmlCreateCastOp(&cast_op, + CNML_CAST_FLOAT16_TO_FLOAT32, + input_tensor->mlu_tensor(), + fp32_input_tensor->mlu_tensor()); + } + auto output_tensor = graph->AddNode( + out_var_name, mlu_output_dim, CNML_TENSOR, CNML_NCHW, CNML_DATA_INT32); + + CHECK(graph->HasNode(x_var_name)); + cnmlBaseOp_t argmax_op{nullptr}; + // ======================= DEBUG INFO ===================== + VLOG(6) << "x_var_name: " << x_var_name; + VLOG(6) << "out_var_name: " << out_var_name; + VLOG(6) << "x dims: " << x->dims(); + VLOG(6) << "output dims: " << output->dims(); + VLOG(6) << "axis: " << axis; + VLOG(6) << "cast_to_fp32: " << cast_to_fp32; + cnmlPrintTensor(input_tensor->mlu_tensor(), CNML_TENSOR); + cnmlPrintTensor(output_tensor->mlu_tensor(), CNML_TENSOR); + // ======================= DEBUG END ===================== + + CNML_CALL(cnmlCreateArgmaxOp(&argmax_op, + argmax_mode, + cast_to_fp32 ? fp32_input_tensor->mlu_tensor() + : input_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + if (cast_to_fp32) { + graph->FuseOp(cast_op); + } + graph->FuseOp(argmax_op); + CNML_CALL(cnmlDestroyBaseOp(&argmax_op)); + if (cast_op) { + CNML_CALL(cnmlDestroyBaseOp(&cast_op)); + } + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(arg_max, + kMLU, + paddle::lite::subgraph::mlu::ArgmaxConverter); diff --git a/lite/kernels/mlu/bridges/argmax_op_test.cc b/lite/kernels/mlu/bridges/argmax_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9eeb172812b8deecd6a8f1f2eb321ade4289fa9b --- /dev/null +++ b/lite/kernels/mlu/bridges/argmax_op_test.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/argmax_op.h" + +#include + +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +template +void argmax_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + int axis = op_info->GetAttr("axis"); + auto x_dims = x->dims(); + if (axis < 0) { + axis += x_dims.size(); + } + auto y_shape = x_dims.Vectorize(); + y_shape.erase(y_shape.begin() + axis); + out->Resize(y_shape); + auto out_dims = out->dims(); + + auto* x_data = x->mutable_data(); + auto* out_data = out->mutable_data(); + + const int size = x_dims[axis]; + const int in_channel = x_dims.count(axis, x_dims.size()); + const int out_channel = out_dims.count(axis, out_dims.size()); + const int in_stride = x_dims.count(axis + 1, x_dims.size()); + const int out_stride = x_dims.count(0, axis); + // int index = 0; + for (int n = 0; n < out_stride; n++) { + for (int k = 0; k < in_stride; k++) { + const float* in_ptr = x_data + n * in_channel + k; + std::vector> vec; + vec.resize(size); + for (int i = 0; i < size; i++) { + vec[i] = std::make_pair(in_ptr[i * in_stride], i); + } + // sort + std::partial_sort(vec.begin(), + vec.begin() + 1, + vec.end(), + std::greater>()); + + out_dtype* out_ptr = out_data + n * out_channel + k; + *out_ptr = vec[0].second; + } + } +} + +void test_argmax(const std::vector& input_shape, int axis) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize(input_shape); + // initialize input&output data + FillTensor(x, -9, 9); + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("arg_max"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", static_cast(axis)); + + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + argmax_ref(op); + out_ref->CopyDataFrom(*out); + Tensor input_x; + input_x.Resize(DDim(input_shape)); + // change input layout from NCHW to NHWC + transpose(x->mutable_data(), + input_x.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_x); + + LaunchOp(op, {x_var_name}, {out_var_name}); + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + std::vector out_shape = input_shape; + out_shape[axis] = 1; + Tensor output_trans; + output_trans.Resize(out_shape); + // Change output layout from NHWC to 
NCHW + transpose(out_data, + output_trans.mutable_data(), + {static_cast(out_shape[0]), + static_cast(out_shape[2]), + static_cast(out_shape[3]), + static_cast(out_shape[1])}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(MLUBridges, arg_max) { + test_argmax({1, 2, 3, 4}, 1); + test_argmax({1, 2, 3, 4}, 2); + test_argmax({1, 2, 3, 4}, 3); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(arg_max, kMLU); diff --git a/lite/kernels/mlu/bridges/batch_norm_op.cc b/lite/kernels/mlu/bridges/batch_norm_op.cc index 7353a685dd5fd3a5bcc8c88def8ffb8b96fdde55..ceac1ac696d788869e77a1b173cc0bb4d10a4e21 100644 --- a/lite/kernels/mlu/bridges/batch_norm_op.cc +++ b/lite/kernels/mlu/bridges/batch_norm_op.cc @@ -48,25 +48,32 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto mean = scope->FindVar(mean_var_name)->GetMutable(); auto mean_dims = mean->dims().Vectorize(); + if (mean_dims.size() < 4) { + mean_dims.insert(mean_dims.begin(), 4 - mean_dims.size(), 1); + } auto mean_tensor = graph->AddNode( - mean_var_name, mean_dims, CNML_CONST, CNML_CNHW, graph->FPType()); + mean_var_name, mean_dims, CNML_CONST, CNML_NHWC, graph->FPType()); auto variance = scope->FindVar(variance_var_name)->GetMutable(); auto variance_dims = variance->dims().Vectorize(); + if (variance_dims.size() < 4) { + variance_dims.insert(variance_dims.begin(), 4 - variance_dims.size(), 1); + } auto variance_tensor = graph->AddNode( - variance_var_name, variance_dims, CNML_CONST, CNML_CNHW, graph->FPType()); + variance_var_name, variance_dims, CNML_CONST, CNML_NHWC, graph->FPType()); auto scale = scope->FindVar(scale_var_name)->GetMutable(); auto bias = scope->FindVar(bias_var_name)->GetMutable(); - int co = static_cast(mean_dims[0]); + int co = static_cast(mean_dims[3]); + std::vector variance_trans(co); + std::vector mean_trans(co); for (int i = 0; i < co; ++i) { - variance->mutable_data()[i] = + variance_trans[i] = scale->data()[i] / sqrtf(variance->data()[i] + epsilon); - mean->mutable_data()[i] = - mean->data()[i] - - bias->data()[i] / variance->data()[i]; + mean_trans[i] = + mean->data()[i] - bias->data()[i] / variance_trans[i]; } auto input_tensor = graph->GetNode(x_var_name); @@ -77,10 +84,14 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { mean_tensor->mlu_tensor(), variance_tensor->mlu_tensor())); - graph->BindConstData(variance_var_name, variance); - graph->BindConstData(mean_var_name, mean); + graph->BindConstRawData( + variance_var_name, variance_trans.data(), variance_trans.size(), true); + graph->BindConstRawData( + mean_var_name, mean_trans.data(), mean_trans.size(), true); graph->FuseOp(bn_op); + CNML_CALL(cnmlDestroyBaseOp(&bn_op)); + return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/cast_op.cc b/lite/kernels/mlu/bridges/cast_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..25d988ce5aee519dfb00574343956022b30a89e7 --- /dev/null +++ b/lite/kernels/mlu/bridges/cast_op.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int CastConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto in_dtype = op_info->GetAttr("in_dtype"); + auto out_dtype = op_info->GetAttr("out_dtype"); + + CHECK(graph->HasNode(x_var_name)); + auto x_tensor = graph->GetNode(x_var_name); + + cnmlDataType_t out_type; + cnmlCastType_t cast_type; + if (in_dtype == 4 && out_dtype == 5) { + cast_type = CNML_CAST_FLOAT16_TO_FLOAT32; + out_type = CNML_DATA_FLOAT32; + } else if (in_dtype == 5 && out_dtype == 4) { + cast_type = CNML_CAST_FLOAT32_TO_FLOAT16; + out_type = CNML_DATA_FLOAT16; + } else { + CHECK(0) << "Unsupported cast type"; + } + + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, out_type); + + cnmlBaseOp_t cast_op; + CNML_CALL(cnmlCreateCastOp(&cast_op, + cast_type, + x_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + graph->FuseOp(cast_op); + CNML_CALL(cnmlDestroyBaseOp(&cast_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(cast, + kMLU, + paddle::lite::subgraph::mlu::CastConverter); diff --git a/lite/kernels/mlu/bridges/cast_op_test.cc b/lite/kernels/mlu/bridges/cast_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..2389ad5560cd2ede710626cfd40f8db8bff56351 --- /dev/null +++ b/lite/kernels/mlu/bridges/cast_op_test.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
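+//
+// Round-trip tests for the MLU cast bridge. The bridge treats in_dtype 4 as
+// FP16 and 5 as FP32, so the two helpers below exercise the FP16 -> FP32 and
+// FP32 -> FP16 paths and compare each element against the original buffer
+// with a 5e-4 tolerance.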
+ +#include "lite/operators/cast_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void test_cast_FP16_to_FP32(std::vector shape) { + // prepare input&output variables + std::string x_var_name = "x"; + std::string out_var_name = "out"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(DDim(shape)); + auto* x_data = x->mutable_data(); + + // initialize input&output data + for (int i = 0; i < x->dims().production(); i++) { + x_data[i] = static_cast(i); + } + // initialize op desc + int in_dtype = 4, out_dtype = 5; + cpp::OpDesc opdesc; + opdesc.SetType("cast"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("in_dtype", in_dtype); + opdesc.SetAttr("out_dtype", out_dtype); + + auto op = CreateOp(opdesc, &scope); + + Tensor data; + data.Resize(DDim(shape)); + auto* copy_data = data.mutable_data(); + data.CopyDataFrom(*x); + x->set_precision(paddle::lite_api::PrecisionType::kFP16); + LaunchOp(op, {x_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], static_cast(copy_data[i]), 5e-4); + } +} + +void test_cast_FP32_to_FP16(std::vector shape) { + // prepare input&output variables + std::string x_var_name = "x"; + std::string out_var_name = "out"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(DDim(shape)); + auto* x_data = x->mutable_data(); + + // initialize input&output data + for (int i = 0; i < x->dims().production(); i++) { + x_data[i] = static_cast(i); + } + // initialize op desc + int in_dtype = 5, out_dtype = 4; + cpp::OpDesc opdesc; + opdesc.SetType("cast"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("in_dtype", in_dtype); + opdesc.SetAttr("out_dtype", out_dtype); + + auto op = CreateOp(opdesc, &scope); + + Tensor data; + data.Resize(DDim(shape)); + auto* copy_data = data.mutable_data(); + data.CopyDataFrom(*x); + x->set_precision(paddle::lite_api::PrecisionType::kFloat); + LaunchOp(op, {x_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(static_cast(out_data[i]), copy_data[i], 5e-4); + } +} + +TEST(MLUBridges, cast) { + test_cast_FP16_to_FP32({2, 3, 4, 5}); + test_cast_FP16_to_FP32({6, 3, 2, 5}); + test_cast_FP32_to_FP16({2, 3, 4, 5}); + test_cast_FP32_to_FP16({6, 3, 2, 5}); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(cast, kMLU); diff --git a/lite/kernels/mlu/bridges/concat_op.cc b/lite/kernels/mlu/bridges/concat_op.cc index 14f0da746a00c1ea10ffae824217dbb2df84df55..1d566639937d79cf1c98c70bfc1294d874fb89c4 100644 --- a/lite/kernels/mlu/bridges/concat_op.cc +++ b/lite/kernels/mlu/bridges/concat_op.cc @@ -44,9 +44,10 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto dims = output_dims.size(); int axis = (param_axis < 0) ? 
(param_axis + dims) : param_axis; - CHECK_LE(axis, 4) << "Unsupport dims in mlu concat"; - int nchw_to_nhwc_axis_map[4] = {0, 3, 1, 2}; - int nhwc_axis = nchw_to_nhwc_axis_map[axis]; + CHECK_LT(axis, dims) << "Unsupport dims in mlu concat"; + // value of nhwc2nchw_axis is index of nhwc + // order of nhwc2nchw_axis is nchw + int nhwc_axis = GetAxisNHWC2NCHW(dims)[axis]; auto output_tensor = graph->AddNode( out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); @@ -60,6 +61,7 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) { &outputs, 1)); graph->FuseOp(concat_op); + CNML_CALL(cnmlDestroyBaseOp(&concat_op)); return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/conv_op.cc b/lite/kernels/mlu/bridges/conv_op.cc index e7e21f7ad2f64275746e015289c9372368e46f5c..84c5bd5638585a5b5e1e22308c9ddf3c06acd9e9 100644 --- a/lite/kernels/mlu/bridges/conv_op.cc +++ b/lite/kernels/mlu/bridges/conv_op.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "lite/operators/conv_op.h" + #include + #include "lite/kernels/mlu/bridges/graph.h" #include "lite/kernels/mlu/bridges/utility.h" #include "lite/kernels/npu/bridges/registry.h" @@ -30,6 +32,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { const auto* op_info = op->op_info(); const auto* scope = op->scope(); VLOG(3) << "[MLU] Converting " << op_info->Type() << "... "; + CHECK(!op_info->HasAttr("act_type")); // get input, filter and op attributes const auto input_var_name = op_info->Input("Input").front(); @@ -43,8 +46,13 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { const auto output_shape = output->dims().Vectorize(); const auto bs = input_dims[0]; const auto oc = filter_dims[0]; + const auto groups = op_info->GetAttr("groups"); + CHECK_EQ(input_dims.size(), 4u); CHECK_EQ(filter_dims.size(), 4u); + CHECK(!(op_info->HasAttr("fuse_relu") && + (op_info->GetAttr("fuse_relu") == true))) + << "UnSupported param fuse_relu is true!"; const auto strides = op_info->GetAttr>("strides"); auto dilations = op_info->GetAttr>("dilations"); auto paddings = op_info->GetAttr>("paddings"); @@ -70,13 +78,32 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { padding_algorithm, input_dims, filter_dims); + bool is_group_mode = groups > 1; + bool is_depthwise_mode = false; + if (filter_dims[0] == groups && filter_dims[1] == 1 && dilations[0] == 1 && + dilations[1] == 1) { // depthwise filter shape = {1, ic ,kh ,kw} + is_depthwise_mode = true; + is_group_mode = false; + } + + auto input_tensor = graph->GetNode(input_var_name); const auto output_tensor = graph->AddNode( output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType()); + std::vector cnml_filter_shape = { + filter_dims[0], filter_dims[1], filter_dims[2], filter_dims[3]}; + if (is_depthwise_mode) { + /*paddle filter shape is {oc , ic / groups == 1, kh, kw} while + cnml depthwise conv filter expect shape {oc / groups == 1 , ic , kh, kw} + so we should shape filter shape + */ + cnml_filter_shape = { + filter_dims[1], filter_dims[0], filter_dims[2], filter_dims[3]}; + } // Create filter node const auto filter_tensor = graph->AddNode(filter_var_name, - filter_dims.Vectorize(), + cnml_filter_shape, CNML_FILTER, CNML_NCHW, graph->FPType()); @@ -89,15 +116,15 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { dequant(filter_dequant.data(), filter->mutable_data(), 1, - filter_dims[0], - filter_dims[1] * filter_dims[2] * filter_dims[3], + cnml_filter_shape[0], + cnml_filter_shape[1] * cnml_filter_shape[2] * 
cnml_filter_shape[3], weight_scale); transpose(filter_dequant.data(), filter->mutable_data(), - {static_cast(filter_dims[0]), - static_cast(filter_dims[1]), - static_cast(filter_dims[2]), - static_cast(filter_dims[3])}, + {static_cast(cnml_filter_shape[0]), + static_cast(cnml_filter_shape[1]), + static_cast(cnml_filter_shape[2]), + static_cast(cnml_filter_shape[3])}, {0, 2, 3, 1}); filter->set_precision(PrecisionType::kFloat); } else if (filter->precision() != PrecisionType::kFloat) { @@ -116,7 +143,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { std::vector bias_shape; if (bias_data_size == oc) { // 0: {oc} - bias_shape = {oc}; + bias_shape = {1, 1, 1, oc}; } else if (bias_data_size == output_data_size / bs) { LOG(FATAL) << "Unsupported ... ..."; // 1: {1, oc, oh, ow} @@ -130,18 +157,15 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { << " isn't supported in conv2d Op when output dimension is " << output_dims; } - bias_tensor = graph->AddNode(bias_var_name, - bias_dims.Vectorize(), - CNML_CONST, - CNML_CNHW, - graph->FPType()); + bias_tensor = graph->AddNode( + bias_var_name, bias_shape, CNML_CONST, CNML_NHWC, graph->FPType()); graph->BindConstData(bias_var_name, bias); } const auto input_scale = op_info->GetAttr("input_scale"); bool use_first_conv = false; - if (lite::DeviceInfo::Global().UseFirstConv() && input_dims[1] == 3) { + if (lite::TargetWrapperMlu::UseFirstConv() && input_dims[1] == 3) { use_first_conv = true; } @@ -158,38 +182,75 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { paddings[0], paddings[0])); const auto mean_tensor = graph->AddNode("first_conv_mean_tensor", - std::vector{3}, + std::vector{1, 1, 1, 3}, CNML_CONST, - CNML_CNHW, + CNML_NHWC, graph->FPType()); const auto std_tensor = graph->AddNode("first_conv_std_tensor", - std::vector{3}, + std::vector{1, 1, 1, 3}, CNML_CONST, - CNML_CNHW, + CNML_NHWC, graph->FPType()); graph->BindConstRawData("first_conv_mean_tensor", - lite::DeviceInfo::Global().MeanVec().data(), + lite::TargetWrapperMlu::MeanVec().data(), 3, false); graph->BindConstRawData("first_conv_std_tensor", - lite::DeviceInfo::Global().StdVec().data(), + lite::TargetWrapperMlu::StdVec().data(), 3, false); - graph->GetNode(input_var_name)->set_mlu_dtype(CNML_DATA_UINT8); + input_tensor->set_mlu_dtype(CNML_DATA_UINT8); CNML_CALL(cnmlCreateConvFirstOpForward( &conv_op, conv_param, - graph->GetNode(input_var_name)->mlu_tensor(), + input_tensor->mlu_tensor(), mean_tensor->mlu_tensor(), output_tensor->mlu_tensor(), filter_tensor->mlu_tensor(), bias_tensor ? bias_tensor->mlu_tensor() : nullptr, std_tensor->mlu_tensor())); CNML_CALL(cnmlDestroyConvFirstOpParam(&conv_param)); + } else if (is_depthwise_mode) { + cnmlConvDepthwiseOpParam_t conv_depthwise_param; + cnmlCreateConvDepthwiseOpParam_V2(&conv_depthwise_param, + strides[0], + strides[1], + paddings[0] * 2, + paddings[2] * 2); + CNML_CALL(cnmlCreateConvDepthwiseOpForward( + &conv_op, + conv_depthwise_param, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + filter_tensor->mlu_tensor(), + bias_tensor ? 
bias_tensor->mlu_tensor() : nullptr)); + CNML_CALL(cnmlDestroyConvDepthwiseOpParam(&conv_depthwise_param)); + } else if (is_group_mode) { + cnmlConvOpParam_t conv_param; + CNML_CALL(cnmlCreateConvOpParam(&conv_param, + strides[0], + strides[1], + dilations[0], + dilations[1], + paddings[0] * 2, + paddings[2] * 2)); + CNML_CALL(cnmlCreateConvGroupOpForward( + &conv_op, + conv_param, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + filter_tensor->mlu_tensor(), + bias_tensor ? bias_tensor->mlu_tensor() : nullptr, + groups)); + CNML_CALL(cnmlDestroyConvOpParam(&conv_param)); } else { cnmlConvOpParam_t conv_param; + VLOG(5) << "conv param (" << input_var_name << ")" + << "stride: " << strides[0] << ',' << strides[1] << '\t' + << "dilations: " << dilations[0] << ',' << dilations[1] << '\t' + << "paddings: " << paddings[0] << ',' << paddings[2] << std::endl; CNML_CALL(cnmlCreateConvOpParam(&conv_param, strides[0], strides[1], @@ -200,19 +261,21 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { CNML_CALL(cnmlCreateConvOpForward( &conv_op, conv_param, - graph->GetNode(input_var_name)->mlu_tensor(), + input_tensor->mlu_tensor(), output_tensor->mlu_tensor(), filter_tensor->mlu_tensor(), bias_tensor ? bias_tensor->mlu_tensor() : nullptr)); CNML_CALL(cnmlDestroyConvOpParam(&conv_param)); } - graph->SetComputingDataType( - conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale); - graph->SetComputingDataType( - conv_op, - filter_tensor->mlu_tensor(), - 1 / *min_element(weight_scale.begin(), weight_scale.end())); + if (!is_depthwise_mode) { + graph->SetComputingDataType( + conv_op, graph->GetNode(input_var_name)->mlu_tensor(), 1 / input_scale); + graph->SetComputingDataType( + conv_op, + filter_tensor->mlu_tensor(), + 1 / *max_element(weight_scale.begin(), weight_scale.end())); + } CNML_CALL(cnmlSetOperationComputingLayout(conv_op, CNML_NHWC)); if (HasInputArg(op_info, scope, "Bias")) { auto* bias = scope->FindVar(bias_var_name)->GetMutable(); @@ -220,6 +283,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { } graph->BindConstData(filter_var_name, filter); graph->FuseOp(conv_op); + CNML_CALL(cnmlDestroyBaseOp(&conv_op)); return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/mlu/bridges/conv_op_test.cc b/lite/kernels/mlu/bridges/conv_op_test.cc index 1b04814d7d88d227d0bb3e0b58aef26d62f06966..ddaf5b321ffd2af1fbd91af6cf15b5c7789cbba3 100644 --- a/lite/kernels/mlu/bridges/conv_op_test.cc +++ b/lite/kernels/mlu/bridges/conv_op_test.cc @@ -13,8 +13,11 @@ // limitations under the License. #include "lite/operators/conv_op.h" + #include + #include + #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" #include "lite/kernels/mlu/bridges/test_helper.h" @@ -331,6 +334,10 @@ TEST(MLUBridges, conv) { #endif } +TEST(MLUBridges, depthwise_conv2d) { + test_conv(1, 8, 8, 14, 14, false, false, false, true, 1, 1, 2, 3); +} + } // namespace mlu } // namespace subgraph } // namespace lite diff --git a/lite/kernels/mlu/bridges/dropout_op.cc b/lite/kernels/mlu/bridges/dropout_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..9aa296236e05a0c80ed9b7001f940cce99b019f7 --- /dev/null +++ b/lite/kernels/mlu/bridges/dropout_op.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int DropoutConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Create act node and set params from op + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + /* auto mask_var_name = op_info->Output("Mask").front(); */ + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + /* auto mask = scope->FindVar(mask_var_name)->GetMutable(); */ + /* auto mask_dims = mask->dims().Vectorize(); */ + /* auto mask_tensor = graph->AddNode( */ + /* mask_var_name, mask_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); */ + + // is_test is true by default + // if(op_info->HasAttr("is_test")){ + // auto is_test = op_info->GetAttr("is_test"); + // CHECK(is_test != true); + // } + + // Param fix_seed and seed is useless in MLU + + auto dropout_implementation = + op_info->GetAttr("dropout_implementation"); + auto dropout_prob = op_info->GetAttr("dropout_prob"); + float alpha = 1.0f - dropout_prob; + if (dropout_implementation == "upscale_in_train") { + alpha = 1.; + } + float beta = 0.; + + std::vector shape = {1, 1, 1, 1}; + std::string alpha_var_name = string_format("dropout_alpha_%p", op); + std::string beta_var_name = string_format("dropout_beta_%p", op); + auto alpha_tensor = graph->AddNode( + alpha_var_name, shape, CNML_CONST, CNML_NHWC, graph->FPType()); + auto beta_tensor = graph->AddNode( + beta_var_name, shape, CNML_CONST, CNML_NHWC, graph->FPType()); + + graph->BindConstRawData(alpha_var_name, &alpha, 1); + graph->BindConstRawData(beta_var_name, &beta, 1); + + auto input_tensor = graph->GetNode(x_var_name); + cnmlBaseOp_t scale_op; + CNML_CALL(cnmlCreateScaleOp(&scale_op, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + alpha_tensor->mlu_tensor(), + beta_tensor->mlu_tensor())); + graph->FuseOp(scale_op); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(dropout, + kMLU, + paddle::lite::subgraph::mlu::DropoutConverter); diff --git a/lite/kernels/mlu/bridges/dropout_op_test.cc b/lite/kernels/mlu/bridges/dropout_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..44f03e3051a6c568d541b98b64808e27470d8916 --- /dev/null +++ b/lite/kernels/mlu/bridges/dropout_op_test.cc @@ -0,0 +1,158 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/dropout_op.h" +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void dropout_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto dropout_implementation = + op_info->GetAttr("dropout_implementation"); + auto dropout_prob = op_info->GetAttr("dropout_prob"); + float alpha = 1.0f - dropout_prob; + if (dropout_implementation == "upscale_in_train") { + alpha = 1.; + } + float beta = 0.; + + auto x_data = x->data(); + auto out_data = out->mutable_data(); + DDim x_dims = x->dims(); + DDim out_dims = out->dims(); + CHECK_EQ(x_dims.production(), out_dims.production()); + for (int i = 0; i < out_dims.production(); i++) { + out_data[i] = x_data[i] * alpha + beta; + } +} + +void test_dropout(int bs, + int ic, + int ih, + int iw, + std::string dropout_implementation, + float dropout_prob, + float bias) { + // prepare input&output variables + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + std::string mask_var_name("mask"); + std::string out_ref_var_name("out_ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* mask = scope.Var(mask_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize({bs, ic, ih, iw}); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + bool is_test = true; + bool fix_seed = false; + int seed = 0; + cpp::OpDesc opdesc; + opdesc.SetType("dropout"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetOutput("Mask", {mask_var_name}); + opdesc.SetAttr("is_test", is_test); + opdesc.SetAttr("fix_seed", fix_seed); + opdesc.SetAttr("seed", seed); + opdesc.SetAttr("dropout_implementation", dropout_implementation); + opdesc.SetAttr("dropout_prob", dropout_prob); + VLOG(6) << "mask: " << mask->dims()[0] << std::endl; + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + dropout_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input_trans; + input_trans.Resize({bs, ic, ih, iw}); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {bs, ic, ih, iw}, + {0, 2, 3, 1}); + auto os = out->dims(); + out->Resize({static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}); + x->CopyDataFrom(input_trans); + x->Resize({bs, ih, iw, ic}); + + LaunchOp(op, {x_var_name}, {out_var_name}); + + // execute reference implementation and save to output tensor('out') + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + Tensor output_trans; + output_trans.Resize(os); + transpose(out_data, + output_trans.mutable_data(), + {static_cast(os[0]), + 
static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(MLUBridges, dropout) { + for (auto bs : {1, 3}) { + for (auto ic : {1, 3}) { + for (auto ih : {3, 4}) { + for (auto iw : {4, 3}) { + for (auto dropout_implementation : + {"downgrade_in_infer", "upscale_in_train"}) { + for (auto dropout_prob : {0.f, 1.0f}) { + VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih + << " iw: " << iw + << " dropout_implementation: " << dropout_implementation + << " dropout_prob: " << dropout_prob; + test_dropout( + bs, ic, ih, iw, dropout_implementation, dropout_prob, 0.); + } + } + } + } + } + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(dropout, kMLU); diff --git a/lite/kernels/mlu/bridges/elementwise_ops.cc b/lite/kernels/mlu/bridges/elementwise_ops.cc index 41526a0100ba71be9eda25983cb96aa888d6cf4d..5f7192a0628a7887dbca15d63f1ba22799d7ee4b 100644 --- a/lite/kernels/mlu/bridges/elementwise_ops.cc +++ b/lite/kernels/mlu/bridges/elementwise_ops.cc @@ -23,7 +23,7 @@ namespace mlu { std::vector CvtYShape(const Tensor& x, Tensor* y, int axis) { auto x_dims = x.dims(); - CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x"; + // CHECK_EQ(x_dims.size(), 4UL) << "[MLU] Only support 4-dimension x"; auto y_dims = y->dims(); CHECK_GE(x_dims.size(), y_dims.size()); @@ -117,6 +117,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { } graph->FuseOp(elementwise_op); + CNML_CALL(cnmlDestroyBaseOp(&elementwise_op)); cnmlBaseOp_t act_op; if (op_type == "fusion_elementwise_add_activation") { auto mid_tensor = graph->GetNode(out_var_name + "_mid"); @@ -127,6 +128,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { mid_tensor->mlu_tensor(), output_tensor->mlu_tensor())); graph->FuseOp(act_op); + CNML_CALL(cnmlDestroyBaseOp(&act_op)); } return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/mlu/bridges/elementwise_ops_test.cc b/lite/kernels/mlu/bridges/elementwise_ops_test.cc index e5087dd708eee3ba255fbfa0383d31b12a6b6870..7844e5b1b57567f72750b21ba288547cb165eb54 100644 --- a/lite/kernels/mlu/bridges/elementwise_ops_test.cc +++ b/lite/kernels/mlu/bridges/elementwise_ops_test.cc @@ -153,7 +153,7 @@ void test_elementwise_add(const std::vector& x_shape, opdesc.SetOutput("Out", {out_var_name}); opdesc.SetAttr("axis", axis); - // create and convert op to NPU model, then run it on NPU + // create and convert op to MLU model, then run it on MLU auto op = CreateOp(opdesc, &scope); // execute reference implementation and save to output tensor diff --git a/lite/kernels/mlu/bridges/fc_op.cc b/lite/kernels/mlu/bridges/fc_op.cc index 286feec8d4d44eaa025f333d559c32ca72f042ff..ed9ef7edd002ad0476efb84b34239ce07641538a 100644 --- a/lite/kernels/mlu/bridges/fc_op.cc +++ b/lite/kernels/mlu/bridges/fc_op.cc @@ -34,7 +34,7 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto w_var_name = op_info->Input("W").front(); auto output_var_name = op_info->Output("Out").front(); - // int in_num_col_dims = op_info->GetAttr("in_num_col_dims"); + CHECK(!op_info->HasAttr("activation_type")); auto x = scope->FindVar(x_var_name)->GetMutable(); auto w = scope->FindVar(w_var_name)->GetMutable(); auto output = scope->FindVar(output_var_name)->GetMutable(); @@ -45,9 +45,28 @@ int FCConverter(void* ctx, 
OpLite* op, KernelBase* kernel) { CHECK_EQ(w_dims.size(), 2UL); // Create w node - std::vector w_shape{w_dims[1], w_dims[0]}; + std::vector cnml_w_shape; + if (x_dims.size() == 4) { + if (x_dims[1] * x_dims[2] * x_dims[3] == w_dims[0]) { + cnml_w_shape = { + static_cast(w_dims[1]), + static_cast(x_dims[1]), // input_c + static_cast(x_dims[2]), // input_h + static_cast(x_dims[3]), // input_w + }; + } else { + LOG(FATAL) + << "in fc op, we expect input_h * input_w * input_c == filter_c" + << " but we got input_c = " << x_dims[1] << " input_h = " << x_dims[2] + << " input_w = " << x_dims[3] << " filter_c = " << w_dims[0] + << std::endl; + } + } else { + cnml_w_shape = {w_dims[1], w_dims[0]}; + } + auto w_tensor = graph->AddNode( - w_var_name, w_shape, CNML_FILTER, CNML_NCHW, graph->FPType()); + w_var_name, cnml_w_shape, CNML_FILTER, CNML_NCHW, graph->FPType()); auto input_scale = op_info->GetAttr("input_scale"); @@ -63,15 +82,15 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { if (HasInputArg(op_info, scope, "Bias")) { bias_var_name = op_info->Input("Bias").front(); auto bias = scope->FindVar(bias_var_name)->GetMutable(); - auto bias_dims = bias->dims(); + auto bias_dims = bias->dims().Vectorize(); CHECK(!graph->HasNode(bias_var_name)); + if (bias_dims.size() < 4u) { + bias_dims.insert(bias_dims.begin(), 4 - bias_dims.size(), 1); + } // CHECK_EQ(bias_dims.production(), n); - bias_tensor = graph->AddNode(bias_var_name, - bias_dims.Vectorize(), - CNML_CONST, - CNML_CNHW, - graph->FPType()); + bias_tensor = graph->AddNode( + bias_var_name, bias_dims, CNML_CONST, CNML_NHWC, graph->FPType()); graph->BindConstData(bias_var_name, bias); } cnmlBaseOp_t fc_op; @@ -88,18 +107,46 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { if (w->precision() == PrecisionType::kUnk || w->precision() == PrecisionType::kInt8) { std::vector w_dequant(w->data_size()); - dequant(w_dequant.data(), - w->mutable_data(), - 1, - w_dims[1], - w_dims[0], - weight_scale); - for (int i = 0; i < w_dims[1]; i++) { - for (int j = 0; j < w_dims[0]; j++) { - w->mutable_data()[i * w_dims[0] + j] = - w_dequant[i + j * w_dims[1]]; - } + if (cnml_w_shape.size() == 2) { + dequant(w_dequant.data(), + w->mutable_data(), + 1, + cnml_w_shape[0], + cnml_w_shape[1], + weight_scale); + transpose2d(w_dequant.data(), + w->mutable_data(), + {static_cast(cnml_w_shape[0]), + static_cast(cnml_w_shape[1])}); + } else if (cnml_w_shape.size() == 4) { + dequant(w_dequant.data(), + w->mutable_data(), + 1, + cnml_w_shape[0], + cnml_w_shape[1] * cnml_w_shape[2] * cnml_w_shape[3], + weight_scale); + + int c_o_num = cnml_w_shape[0]; + int c_i_num = cnml_w_shape[1]; + int h_i_num = cnml_w_shape[2]; + int w_i_num = cnml_w_shape[3]; + + // chw == ci * hi * wi == w_dim[0] + // first trans [chw, co] -> [co,chw] + std::vector first_trans_output(w_dequant.size()); + int chw = c_i_num * h_i_num * w_i_num; + transpose2d(w_dequant.data(), first_trans_output.data(), {chw, c_o_num}); + + // second trans [co,ci,hi,wi] -> [co,hi,wi,ci] + transpose(first_trans_output.data(), + w->mutable_data(), + {c_o_num, c_i_num, h_i_num, w_i_num}, + {0, 2, 3, 1}); + } else { + LOG(FATAL) << "expect w_shape.size == 2 or 4, but got " + << cnml_w_shape.size() << std::endl; } + w->set_precision(PrecisionType::kFloat); } else if (w->precision() != PrecisionType::kFloat) { LOG(FATAL) << "UnSupported weight precision!"; @@ -110,9 +157,10 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { graph->SetComputingDataType( fc_op, w_tensor->mlu_tensor(), - 1 / 
*min_element(weight_scale.begin(), weight_scale.end())); + 1 / *max_element(weight_scale.begin(), weight_scale.end())); graph->FuseOp(fc_op); + CNML_CALL(cnmlDestroyBaseOp(&fc_op)); return REBUILD_WHEN_SHAPE_CHANGED; } diff --git a/lite/kernels/mlu/bridges/fc_op_test.cc b/lite/kernels/mlu/bridges/fc_op_test.cc index fe1c889f431350b4175ac400aefe77e6392405c5..af856a55a2ddc563d210af3b4ef0e669b32f5a57 100644 --- a/lite/kernels/mlu/bridges/fc_op_test.cc +++ b/lite/kernels/mlu/bridges/fc_op_test.cc @@ -175,9 +175,9 @@ void test_fc(const std::vector& input_shape, TEST(MLUBridges, fc) { for (bool use_bias : {true, false}) { - // test_fc({1, 8, 8, 1}, {64, 4}, 1, use_bias); - // test_fc({1, 5, 5, 1}, {25, 7}, 1, use_bias); - // test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias); + test_fc({1, 8, 8, 1}, {64, 4}, 1, use_bias); + test_fc({1, 5, 5, 1}, {25, 7}, 1, use_bias); + test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias); test_fc({1, 1024, 1, 1}, {1024, 32}, 1, use_bias); } } diff --git a/lite/kernels/mlu/bridges/flatten_op.cc b/lite/kernels/mlu/bridges/flatten_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..faf7e6fd2801cdcaad4bce0a20921843f1d1b516 --- /dev/null +++ b/lite/kernels/mlu/bridges/flatten_op.cc @@ -0,0 +1,124 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
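+//
+// MLU bridge for flatten/flatten2. Graph tensors are laid out as NHWC, so
+// the converter fuses three ops: a transpose back to NCHW, an ND reshape to
+// the flattened output shape, and a final transpose that returns the result
+// to NHWC.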
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int FlattenConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + // ================== Trans1: NHWC => NCHW =========================== + auto input_tensor = graph->GetNode(x_var_name); + auto trans_1_axis = std::move(GetAxisNHWC2NCHW(x->dims().size())); + auto trans1_out = graph->AddNode(x_var_name + ".trans.i", + x->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + graph->FPType(), + CNML_NCHW); + cnmlBaseOp_t trans1_op{nullptr}; + cnmlNdTransposeOpParam_t trans1_param{nullptr}; + CNML_CALL(cnmlCreateNdTransposeOpParam( + &trans1_param, trans_1_axis.data(), trans_1_axis.size())); + CNML_CALL(cnmlCreateNdTransposeProOp(&trans1_op, + input_tensor->mlu_tensor(), + trans1_out->mlu_tensor(), + trans1_param)); + // ======================== Trans1 End ================================== + + // ======================= Flatten op =================================== + cnmlBaseOp_t flatten_op; + auto trans2_input = graph->AddNode(out_var_name + ".trans.o", + output_dims, + CNML_TENSOR, + CNML_NCHW, + graph->FPType(), + CNML_NCHW); + int cnml_trans2_input_shape[4]; + CNML_CALL( + cnmlGetTensorShape(trans2_input->mlu_tensor(), cnml_trans2_input_shape)); + cnmlReshapeOpParam_t reshape_param{nullptr}; + CNML_CALL(cnmlCreateNdReshapeOpParam( + &reshape_param, cnml_trans2_input_shape, output->dims().size())); + + // Use cnmlCreatexxxOpForward to create op. 
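+  // The reshape reads the NCHW tensor produced by Trans1 and writes into
+  // trans2_input, whose shape (queried above) defines reshape_param; Trans2
+  // below restores the NHWC order the rest of the graph expects.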
+ CNML_CALL(cnmlCreateReshapeOp(&flatten_op, + reshape_param, + trans1_out->mlu_tensor(), + trans2_input->mlu_tensor())); + // ======================= Flatten End =================================== + + // ================== Trans2: NCHW => NHWC =============================== + auto trans_2_axis = std::move(GetAxisNCHW2NHWC(output->dims().size())); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + cnmlBaseOp_t trans2_op{nullptr}; + cnmlNdTransposeOpParam_t trans2_param{nullptr}; + CNML_CALL(cnmlCreateNdTransposeOpParam( + &trans2_param, trans_2_axis.data(), trans_2_axis.size())); + CNML_CALL(cnmlCreateNdTransposeProOp(&trans2_op, + trans2_input->mlu_tensor(), + output_tensor->mlu_tensor(), + trans2_param)); + // ======================== Trans2 End ================================== + + // ============== DEBUG LOG =============== + + VLOG(6) << "x_var_name: " << x_var_name; + VLOG(6) << "out_var_name: " << out_var_name; + VLOG(6) << "input dim: " << x->dims(); + VLOG(6) << "output dim: " << output->dims(); + // cnmlPrintTensor(input_tensor->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(trans1_out->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(trans2_input->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(output_tensor->mlu_tensor(), CNML_TENSOR); + // ============== DEBUG END =============== + graph->FuseOp(trans1_op); + graph->FuseOp(flatten_op); + graph->FuseOp(trans2_op); + CNML_CALL(cnmlDestroyBaseOp(&trans1_op)); + CNML_CALL(cnmlDestroyBaseOp(&flatten_op)); + CNML_CALL(cnmlDestroyBaseOp(&trans2_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(flatten, + kMLU, + paddle::lite::subgraph::mlu::FlattenConverter); +REGISTER_SUBGRAPH_BRIDGE(flatten2, + kMLU, + paddle::lite::subgraph::mlu::FlattenConverter); diff --git a/lite/kernels/mlu/bridges/flatten_op_test.cc b/lite/kernels/mlu/bridges/flatten_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..190b837ffeecfd494ffbd748220207cd63da5c06 --- /dev/null +++ b/lite/kernels/mlu/bridges/flatten_op_test.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/flatten_op.h" + +#include + +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void test_flatten(std::vector input_shape, int axis) { + // prepare input&output variables + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(input_shape); + Tensor x_cpu; + + // initialize input&output data + FillTensor(x); + x_cpu.CopyDataFrom(*x); + + Tensor input_trans; + input_trans.Resize(input_shape); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_trans); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("flatten2"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", axis); + auto op = CreateOp(opdesc, &scope); + + LaunchOp(op, {x_var_name}, {out_var_name}); + // compare results + auto* out_data = out->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], x_cpu.mutable_data()[i], 1e-5); + } +} + +TEST(MLUBridges, flatten) { test_flatten({1, 2, 4, 4}, 2); } +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(flatten, kMLU); +USE_SUBGRAPH_BRIDGE(flatten2, kMLU); diff --git a/lite/kernels/mlu/bridges/gather_op.cc b/lite/kernels/mlu/bridges/gather_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b68f1af76456eede14ec550c623d6a8355f5d5e8 --- /dev/null +++ b/lite/kernels/mlu/bridges/gather_op.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int GatherConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("X").front(); + auto index_var_name = op_info->Input("Index").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + CHECK(graph->HasNode(x_var_name)); + auto x_tensor = graph->GetNode(x_var_name); + auto index_tensor = graph->GetNode(index_var_name); + + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + + cnmlBaseOp_t gather_op; + CNML_CALL(cnmlCreateGatherV2Op(&gather_op, + x_tensor->mlu_tensor(), + index_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + CNML_DIM_N)); + graph->FuseOp(gather_op); + CNML_CALL(cnmlDestroyBaseOp(&gather_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(gather, + kMLU, + paddle::lite::subgraph::mlu::GatherConverter); diff --git a/lite/kernels/mlu/bridges/gather_op_test.cc b/lite/kernels/mlu/bridges/gather_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..413de7c9d7fda750b387c2daa21ef1e40e7982c7 --- /dev/null +++ b/lite/kernels/mlu/bridges/gather_op_test.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/gather_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +template +void gather_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto index = + scope->FindVar(op_info->Input("Index").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + + auto x_dims = x->dims(); + auto index_dims = index->dims(); + CHECK(index_dims.size() == 1 || + (index_dims.size() == 2 && index_dims[1] == 1)); + + int batch_size = index_dims[0]; + DDim out_dims = x_dims; + out_dims[0] = batch_size; + out->Resize(out_dims); + + auto x_data = x->data(); + auto index_data = index->data(); + auto out_data = out->mutable_data(); + + auto slice_num = x_dims[0]; + auto slice_size = x_dims.Slice(1, x_dims.size()).production(); + for (int i = 0; i < batch_size; i++) { + auto index = index_data[i]; + CHECK_LT(index, slice_num) << "index <= slice_num"; + CHECK_GE(index, 0) << "index > 0"; + memcpy(out_data + i * slice_size, + x_data + index * slice_size, + slice_size * sizeof(float)); + } +} + +void test_gather() { + // prepare input&output variables + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + std::string index_var_name = "index"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + auto* index = scope.Var(index_var_name)->GetMutable(); + + x->Resize({5, 4, 3, 2}); + index->Resize({2}); + // initialize input&output data + FillTensor(x); + FillTensor(index, 1, 3); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("gather"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetInput("Index", {index_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + auto op = CreateOp(opdesc, &scope); + gather_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input; + input.Resize({5, 4, 3, 2}); + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(5), + static_cast(4), + static_cast(3), + static_cast(2)}, + {0, 2, 3, 1}); + x->CopyDataFrom(input); + LaunchOp(op, {x_var_name, index_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + + Tensor output; + output.Resize(out->dims()); + transpose(out_data, + output.mutable_data(), + {static_cast(out->dims()[0]), + static_cast(out->dims()[2]), + static_cast(out->dims()[3]), + static_cast(out->dims()[1])}, + {0, 3, 1, 2}); + out_data = output.mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], out_ref_data[i], 5e-4); + } +} + +TEST(MLUBridges, gather) { test_gather(); } + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(gather, kMLU); diff --git a/lite/kernels/mlu/bridges/graph.cc b/lite/kernels/mlu/bridges/graph.cc index 65c2f8214c13ee8d004dbe4b2e706523d007469c..bbe88547c8d60e1468653a28dad97af09b24f952 100644 --- a/lite/kernels/mlu/bridges/graph.cc +++ b/lite/kernels/mlu/bridges/graph.cc @@ -27,10 +27,14 @@ 
std::shared_ptr Graph::AddNode(const std::string& name, cnmlTensorType_t tensor_type, cnmlDataOrder_t shape_order, cnmlDataType_t mlu_dtype, + cnmlDataOrder_t data_order, void* raw_ptr) { CHECK(!HasNode(name)); + VLOG(5) << "add mlu node: " << name << "\t data type " + << static_cast(mlu_dtype) << "\t data order " + << static_cast(data_order); auto node = std::shared_ptr( - new MLUTensor(shape, tensor_type, shape_order, mlu_dtype)); + new MLUTensor(shape, tensor_type, shape_order, mlu_dtype, data_order)); node->set_mlu_ptr(raw_ptr); nodes_.insert(std::make_pair(name, node)); return node; diff --git a/lite/kernels/mlu/bridges/graph.h b/lite/kernels/mlu/bridges/graph.h index 2c6bd63a87e53332a329d0c5c66fcf372a2584ca..07c6b20efb9a72106cf6ae288c411e490345b089 100644 --- a/lite/kernels/mlu/bridges/graph.h +++ b/lite/kernels/mlu/bridges/graph.h @@ -15,13 +15,15 @@ #pragma once #include -#include #include #include +#include #include + #include "lite/core/op_lite.h" #include "lite/core/tensor.h" #include "lite/kernels/mlu/bridges/tensor.h" +#include "lite/utils/env.h" #define PRINT_HW_TIME false @@ -45,32 +47,30 @@ class Graph { CNRT_CALL(cnrtCreateNotifier(¬ifier_end_)); #endif } - ~Graph() { FreeConstData(); CNML_CALL(cnmlDestroyFusionOp(&fusion_op_)); - for (auto op : ops_) { - CNML_CALL(cnmlDestroyBaseOp(&op)); - } #if PRINT_HW_TIME CNRT_CALL(cnrtDestroyNotifier(¬ifier_start_)); CNRT_CALL(cnrtDestroyNotifier(¬ifier_end_)); double total_time = 0; - for (auto& f : time_log_) { - total_time += f; + if (!time_log_.empty()) { + for (auto& f : time_log_) { + total_time += f; + } + std::cout << "cnml hardware time for " << time_log_.size() + << " process:" << total_time / time_log_.size() << std::endl; } - std::cout << "cnml hardware time for " << time_log_.size() - << " process:" << total_time / time_log_.size() << std::endl; #endif } - // Data node std::shared_ptr AddNode( const std::string& name, std::vector shape, cnmlTensorType_t tensor_type = CNML_TENSOR, - cnmlDataOrder_t data_order = CNML_NCHW, + cnmlDataOrder_t shape_order = CNML_NCHW, cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32, + cnmlDataOrder_t data_order = CNML_NHWC, void* raw_ptr = nullptr); std::shared_ptr GetNode(const std::string& name) { @@ -82,9 +82,16 @@ class Graph { return nodes_.find(name) != nodes_.end(); } - void AddInput(std::shared_ptr tensor) { + void AddInput(std::shared_ptr tensor, + bool disable_batch_size_changeable = true) { inputs_.push_back(tensor->mlu_tensor()); input_tensors_.push_back(tensor); + if (!disable_batch_size_changeable) { + constexpr int input_dimNb = 4; + bool input_dim_mutable[4] = {true, false, false, false}; + CNML_CALL(cnmlSetTensorDimMutable( + tensor->mlu_tensor(), input_dim_mutable, input_dimNb)); + } } void AddOutput(std::shared_ptr tensor) { @@ -92,6 +99,22 @@ class Graph { output_tensors_.push_back(tensor); } + std::vector>* MutableInputs() { + return &input_tensors_; + } + + std::vector>* MutableOutputs() { + return &output_tensors_; + } + void GenOfflineModel(const std::string& name) { + cnmlModel_t model; + const std::string& symbol = "subnet0"; + const auto& filename = name + ".offline.cambricon"; + CNML_CALL(cnmlCreateModel(&model, filename.c_str())); + CNML_CALL(cnmlAddFusionOpToModel(model, fusion_op_, symbol.c_str())); + CNML_CALL(cnmlSaveModel(model, filename.c_str())); + CNML_CALL(cnmlDestroyModel(model)); + } void FuseOp(cnmlBaseOp_t op) { CNML_CALL(cnmlFuseOp(op, fusion_op_)); } void Compile(cnmlCoreVersion_t core_version, int core_number) { @@ -103,18 +126,37 @@ class Graph { 
CNML_CALL(cnmlSetFusionOpCorenum(fusion_op_, core_number)); CNML_CALL(cnmlSetFusionOpCoreVersion(fusion_op_, core_version)); CNML_CALL(cnmlCompileFusionOp_V2(fusion_op_)); - for (auto in : input_tensors_) { - input_addrs_.push_back(in->mlu_data()); - } - for (auto out : output_tensors_) { - output_addrs_.push_back(out->mlu_data()); - } } +#define MEASURE_HWTIME_START(que) \ + do { \ + CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que)); \ + } while (0) + +#define MEASURE_HWTIME_END(que) \ + do { \ + thread_local float hw_time; \ + CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que)); \ + CNRT_CALL(cnrtSyncQueue(que)); \ + CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time)); \ + hw_time /= 1000.0f; \ + DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl; \ + std::lock_guard lk(time_mut_); \ + time_log_.push_back(hw_time); \ + } while (0) + void Compute(cnrtInvokeFuncParam_t forward_param, cnrtQueue_t que) { + input_addrs_.resize(input_tensors_.size()); + output_addrs_.resize(output_tensors_.size()); + for (size_t i = 0; i < input_addrs_.size(); ++i) { + input_addrs_[i] = input_tensors_[i]->mlu_data(); + } + for (size_t i = 0; i < output_addrs_.size(); ++i) { + output_addrs_[i] = output_tensors_[i]->mlu_data(); + } + #if PRINT_HW_TIME - thread_local float hw_time; - CNRT_CALL(cnrtPlaceNotifier(notifier_start_, que)); + MEASURE_HWTIME_START(que); #endif CNML_CALL(cnmlComputeFusionOpForward_V3(fusion_op_, input_addrs_.data(), @@ -124,18 +166,46 @@ class Graph { &forward_param, que)); #if PRINT_HW_TIME - CNRT_CALL(cnrtPlaceNotifier(notifier_end_, que)); + MEASURE_HWTIME_END(que); #endif + } - CNRT_CALL(cnrtSyncQueue(que)); + void Compute(cnrtQueue_t que, + const std::vector>& in, + const std::vector>& out) { + std::vector in_tensor; + std::vector out_tensor; + input_addrs_.resize(in.size()); + output_addrs_.resize(out.size()); + for (size_t i = 0; i < input_addrs_.size(); ++i) { + input_addrs_[i] = in[i]->mlu_data(); + in_tensor.push_back(in[i]->mlu_tensor()); + } + for (size_t i = 0; i < output_addrs_.size(); ++i) { + output_addrs_[i] = out[i]->mlu_data(); + out_tensor.push_back(out[i]->mlu_tensor()); + } + +#if PRINT_HW_TIME + MEASURE_HWTIME_START(que); +#endif + /* Because of using cnmlSetTensorDimMutable, cnmlComputeFusionOpForward_V3 + * -> cnmlComputeFusionOpForward_V4 */ + CNML_CALL(cnmlComputeFusionOpForward_V4(fusion_op_, + &in_tensor[0], + input_addrs_.data(), + input_addrs_.size(), + &out_tensor[0], + output_addrs_.data(), + output_addrs_.size(), + que, + NULL)); #if PRINT_HW_TIME - CNRT_CALL(cnrtNotifierDuration(notifier_start_, notifier_end_, &hw_time)); - hw_time /= 1000.0f; - DLOG(INFO) << "cnml hardware time " << hw_time << "ms" << std::endl; - std::lock_guard lk(time_mut_); - time_log_.push_back(hw_time); + MEASURE_HWTIME_END(que); #endif } +#undef MEASURE_HWTIME_START +#undef MEASURE_HWTIME_END template void* RegisterConstData(size_t len) { @@ -165,7 +235,7 @@ class Graph { CNML_CALL(cnmlBindConstData_V2( nodes_[tensor_name]->mlu_tensor(), alloc_data, false)); } else if (fp_type_ == CNML_DATA_FLOAT16) { - void* data_fp16 = RegisterConstData<::paddle::lite::fluid::float16>(len); + void* data_fp16 = RegisterConstData(len); CNRT_CALL( cnrtCastDataType(const_cast(static_cast(data)), CNRT_FLOAT32, @@ -180,7 +250,7 @@ class Graph { } } - void BindConstData(std::string tensor_name, ::paddle::lite::Tensor* tensor) { + void BindConstData(std::string tensor_name, paddle::lite::Tensor* tensor) { const float* data = tensor->data(); size_t len = 
tensor->data_size(); if (fp_type_ == CNML_DATA_FLOAT32) { @@ -189,10 +259,14 @@ class Graph { const_cast(static_cast(data)), false)); } else if (fp_type_ == CNML_DATA_FLOAT16) { - auto* data_fp16 = tensor->mutable_data<::paddle::lite::fluid::float16>(); - for (size_t i = 0; i < len; ++i) { - data_fp16[i] = static_cast<::paddle::lite::fluid::float16>(data[i]); - } + void* data_fp16 = RegisterConstData(len); + CNRT_CALL( + cnrtCastDataType(const_cast(static_cast(data)), + CNRT_FLOAT32, + data_fp16, + CNRT_FLOAT16, + len, + nullptr)); CNML_CALL(cnmlBindConstData_V2(nodes_[tensor_name]->mlu_tensor(), static_cast(data_fp16), false)); @@ -206,19 +280,23 @@ class Graph { float scale, cnmlDataType_t data_type = CNML_DATA_INT8) { cnmlQuantizedParam_t quant_param; - CNML_CALL( - cnmlCreateQuantizedParam(&quant_param, scale2position(scale), 1, 0.0)); + int pos = scale2position(scale); + auto cnml_scale = pow(2, pos) * scale; + VLOG(5) << "[cnml quantized param] pos: " << pos + << "\tscale: " << cnml_scale << std::endl; + CNML_CALL(cnmlCreateQuantizedParam(&quant_param, pos, cnml_scale, 0.0)); CNML_CALL( cnmlSetOperationComputingDataType(op, tensor, data_type, quant_param)); CNML_CALL(cnmlDestroyQuantizedParam(&quant_param)); } - void SetFPType(::paddle::lite_api::PrecisionType type) { + void SetFPType(paddle::lite_api::PrecisionType type) { + origin_fp_type_ = type; switch (type) { - case ::paddle::lite_api::PrecisionType::kFP16: + case paddle::lite_api::PrecisionType::kFP16: fp_type_ = CNML_DATA_FLOAT16; break; - case ::paddle::lite_api::PrecisionType::kFloat: + case paddle::lite_api::PrecisionType::kFloat: fp_type_ = CNML_DATA_FLOAT32; break; default: @@ -230,14 +308,14 @@ class Graph { private: cnmlDataType_t fp_type_{CNML_DATA_FLOAT32}; - std::map> nodes_; + paddle::lite_api::PrecisionType origin_fp_type_{PRECISION(kFloat)}; + std::unordered_map> nodes_; std::vector inputs_; std::vector outputs_; std::vector input_addrs_; std::vector output_addrs_; std::vector> input_tensors_; std::vector> output_tensors_; - std::vector ops_; cnmlFusionOp_t fusion_op_; std::vector const_data_storage_; #if PRINT_HW_TIME diff --git a/lite/kernels/mlu/bridges/interpolate_op.cc b/lite/kernels/mlu/bridges/interpolate_op.cc index 2c1a2aeeff799d31d4328169fce058259543fb1f..32840736b8d9a9712d59a8175cd7d70311a34aad 100644 --- a/lite/kernels/mlu/bridges/interpolate_op.cc +++ b/lite/kernels/mlu/bridges/interpolate_op.cc @@ -85,6 +85,7 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { nn_param)); CNML_CALL(cnmlDestroyNearestNeighborOpParam(&nn_param)); graph->FuseOp(interp_op); + CNML_CALL(cnmlDestroyBaseOp(&interp_op)); return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/layout_op.cc b/lite/kernels/mlu/bridges/layout_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d14695c4357e06832e06a68646628bfa8d211c43 --- /dev/null +++ b/lite/kernels/mlu/bridges/layout_op.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int LayoutConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("Input").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + std::shared_ptr output_tensor; + + CHECK(graph->HasNode(x_var_name)); + std::vector axis; + auto x_tensor = graph->GetNode(x_var_name); + auto x_data_order = x_tensor->dorder(); + auto x_dims = x->dims().Vectorize(); + if (x_data_order == CNML_NCHW) { + switch (x_dims.size()) { + case 2: + axis = {0, 1}; + break; + case 3: + axis = {0, 2, 1}; + break; + case 4: + axis = {0, 2, 3, 1}; + break; + case 5: + axis = {0, 2, 3, 4, 1}; + break; + default: + CHECK(0) << "Unsupport shape"; + } + output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, x_tensor->dtype()); + VLOG(3) << "layout transpose nchw to nhwc" << std::endl; + } else { + switch (x_dims.size()) { + case 2: + axis = {0, 1}; + break; + case 3: + axis = {0, 2, 1}; + break; + case 4: + axis = {0, 3, 1, 2}; + break; + case 5: + axis = {0, 4, 1, 2, 3}; + break; + default: + CHECK(0) << "Unsupport shpae"; + } + VLOG(3) << "layout transpose nhwc to nchw" << std::endl; + output_tensor = graph->AddNode(out_var_name, + output_dims, + CNML_TENSOR, + CNML_NCHW, + x_tensor->dtype(), + CNML_NCHW); + } + cnmlBaseOp_t layout_op; + cnmlNdTransposeOpParam_t transpose_param; + CNML_CALL( + cnmlCreateNdTransposeOpParam(&transpose_param, axis.data(), axis.size())); + CNML_CALL(cnmlCreateNdTransposeProOp(&layout_op, + x_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + transpose_param)); + graph->FuseOp(layout_op); + CNML_CALL(cnmlDestroyBaseOp(&layout_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(layout, + kMLU, + paddle::lite::subgraph::mlu::LayoutConverter); diff --git a/lite/kernels/mlu/bridges/layout_op_test.cc b/lite/kernels/mlu/bridges/layout_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..69b905b0750fe99e29c6aaa9bffdc9f20229a239 --- /dev/null +++ b/lite/kernels/mlu/bridges/layout_op_test.cc @@ -0,0 +1,190 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/layout_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void test_layout_NHWC2NCHW(std::vector input_shape) { + // prepare input&output variables + std::string x_var_name = "input"; + std::string out_var_name = "out"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(DDim(input_shape)); + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("layout"); + opdesc.SetInput("Input", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + auto op = CreateOp(opdesc, &scope); + + // execute reference implementation and save to output tensor + Tensor input; + input.Resize(DDim(input_shape)); + switch (input_shape.size()) { + case 2: + transpose( + x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), static_cast(input_shape[1])}, + {0, 1}); + break; + case 3: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[2]), + static_cast(input_shape[1])}, + {0, 2, 1}); + break; + case 4: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[2]), + static_cast(input_shape[3]), + static_cast(input_shape[1])}, + {0, 3, 1, 2}); + break; + case 5: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[2]), + static_cast(input_shape[3]), + static_cast(input_shape[4]), + static_cast(input_shape[1])}, + {0, 4, 1, 2, 3}); + break; + default: + CHECK(0) << "Unsupport"; + } + auto* x_data = input.mutable_data(); + LaunchOp(op, {x_var_name}, {out_var_name}); + + // compare results + auto* out_data = out->mutable_data(); + + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], x_data[i], 5e-4); + } +} + +void test_layout_NCHW2NHWC(std::vector input_shape) { + // prepare input&output variables + std::string x_var_name = "input"; + std::string out_var_name = "out"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(DDim(input_shape)); + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("layout"); + opdesc.SetInput("Input", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + auto op = CreateOp(opdesc, &scope); + + // execute reference implementation and save to output tensor + Tensor input; + input.Resize(DDim(input_shape)); + switch (input_shape.size()) { + case 2: + transpose( + x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), static_cast(input_shape[1])}, + {0, 1}); + break; + case 3: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2])}, + {0, 2, 1}); + break; + case 4: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + break; + case 5: + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(input_shape[0]), + 
static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3]), + static_cast(input_shape[4])}, + {0, 2, 3, 4, 1}); + break; + default: + CHECK(0) << "Unsupport"; + } + auto* x_data = input.mutable_data(); + LaunchOp(op, {x_var_name}, {out_var_name}, CNML_NCHW); + + // compare results + auto* out_data = out->mutable_data(); + + for (int i = 0; i < out->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data[i], x_data[i], 5e-4); + } +} + +TEST(MLUBridges, layout) { + test_layout_NHWC2NCHW({12, 32, 4}); + test_layout_NHWC2NCHW({12, 32, 44, 3}); + test_layout_NHWC2NCHW({12, 32, 44, 3, 6}); + test_layout_NCHW2NHWC({12, 32, 55}); + test_layout_NCHW2NHWC({12, 32, 44, 3}); + test_layout_NCHW2NHWC({12, 32, 44, 3, 8}); + test_layout_NHWC2NCHW({12, 32}); + test_layout_NCHW2NHWC({12, 32}); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(layout, kMLU); diff --git a/lite/kernels/mlu/bridges/lrn_op.cc b/lite/kernels/mlu/bridges/lrn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..657f0dd6781590e1a9ca90bf25e4efcf789863dd --- /dev/null +++ b/lite/kernels/mlu/bridges/lrn_op.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int LrnConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Create lrn node and get params from op + auto fp_type = graph->FPType(); + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, fp_type); + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + + auto alpha = op_info->GetAttr("alpha"); + auto beta = op_info->GetAttr("beta"); + auto k = op_info->GetAttr("k"); + if (op_info->HasAttr("norm_region")) { + CHECK(op_info->GetAttr("norm_region") == "AcrossChannels") + << "Unsuport WithinChannel"; + } + auto local_size = op_info->GetAttr("n"); + CHECK(op_info->HasAttr("input_scale")); + auto input_scale = op_info->GetAttr("input_scale"); + VLOG(5) << "lrn input scale: " << input_scale; + + cnmlLrnOpParam_t param; + cnmlBaseOp_t lrn_op; + CNML_CALL( + cnmlCreateLrnOpParam(¶m, CNML_LRN_V3, local_size, alpha, beta, k)); + CNML_CALL(cnmlCreateLrnOp( + &lrn_op, param, input_tensor->mlu_tensor(), output_tensor->mlu_tensor())); + CNML_CALL(cnmlDestroyLrnOpParam(¶m)); + + graph->SetComputingDataType( + lrn_op, input_tensor->mlu_tensor(), 1 / input_scale); + CNML_CALL(cnmlSetOperationComputingDataType( + lrn_op, output_tensor->mlu_tensor(), fp_type, nullptr)); + + graph->FuseOp(lrn_op); + CNML_CALL(cnmlDestroyBaseOp(&lrn_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(lrn, kMLU, paddle::lite::subgraph::mlu::LrnConverter); diff --git a/lite/kernels/mlu/bridges/lrn_op_test.cc b/lite/kernels/mlu/bridges/lrn_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..21f7e816baeac264bf1b43b7520d464afa38c395 --- /dev/null +++ b/lite/kernels/mlu/bridges/lrn_op_test.cc @@ -0,0 +1,242 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/lrn_op.h" +#include +#include +#include +#include +#include + +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +/** + * @brief get sum of x^2 between channels [size elements] + * + * @tparam float + * @param input + * @param channel_id: the c-th channel within n-th graph. + * @param offset_within_channel: the pixel's offset within a channel. + * @param offset_num: the first address of n-th graph. + * @param c + * @param h + * @param w + * @param size + * @return float + */ +float lrn_square(const float* input, + int channel_id, + int offset_within_channel, + int offset_num, + int c, + int h, + int w, + int size) { + int pre_pad = (size - 1) / 2; + float res = 0; + const float* src = input + offset_num; + + // handle left channels with padding situation. + if (channel_id - pre_pad < 0) { + for (int i = 0; i <= channel_id; ++i) { + res += src[i * h * w + offset_within_channel] * + src[i * h * w + offset_within_channel]; + } + } + + // handle left channels. + if (channel_id - pre_pad >= 0) { + for (int i = channel_id - pre_pad; i <= channel_id; ++i) { + res += src[i * h * w + offset_within_channel] * + src[i * h * w + offset_within_channel]; + } + } + + // handle right channels. + if (channel_id + pre_pad < c) { + for (int i = channel_id + 1; i <= channel_id + pre_pad; ++i) { + res += src[i * h * w + offset_within_channel] * + src[i * h * w + offset_within_channel]; + } + } + + // handle right channels with padding situation. + if (channel_id + pre_pad >= c && channel_id + 1 < c) { + for (int i = channel_id + 1; i < c; ++i) { + res += src[i * h * w + offset_within_channel] * + src[i * h * w + offset_within_channel]; + } + } + + return res; +} + +void lrn_compute_ref(std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = + scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = scope->FindVar(op_info->Output("Out").front()) + ->GetMutable(); + + const float* x_data = x->data(); + float* out_data = out->mutable_data(); + auto x_dims = x->dims(); + + auto alpha = op_info->GetAttr("alpha"); + auto beta = op_info->GetAttr("beta"); + auto k = op_info->GetAttr("k"); + auto norm_region = op_info->GetAttr("norm_region"); + auto local_size = op_info->GetAttr("n"); + + int N = x_dims[0]; + int C = x_dims[1]; + int H = x_dims[2]; + int W = x_dims[3]; + + int offset_num = 0; + int offset_within_channel = 0; + int dst_id; + + float square; + + for (int n = 0; n < N; ++n) { + offset_num = n * C * H * W; + + for (int c = 0; c < C; ++c) { + for (int h = 0; h < H; ++h) { + for (int w = 0; w < W; ++w) { + offset_within_channel = h * W + w; + dst_id = offset_num + c * H * W + offset_within_channel; + square = lrn_square(x_data, + c, + offset_within_channel, + offset_num, + C, + H, + W, + local_size); + out_data[dst_id] = x_data[dst_id] * pow(k + alpha * square, -beta); + } + } + } + } +} + +void test_lrn(float alpha, + float beta, + float k, + int local_size, + int n, + int c, + int h, + int w, + const std::string& norm_region) { + Scope scope; + std::string x_var_name("X_test"); + std::string out_var_name("Out_test"); + std::string out_ref_var_name("Out_ref"); + auto* x = scope.NewTensor(x_var_name); + auto* out = scope.NewTensor(out_var_name); + auto* out_ref = scope.NewTensor(out_ref_var_name); + + std::vector x_dim{n, c, 
h, w}; + x->Resize(x_dim); + out->Resize(x_dim); + out_ref->Resize(x_dim); + auto* x_data = x->mutable_data(); + FillTensor(x, 0.f, 1.f); + float *dmax, *dmin; + std::tie(dmin, dmax) = + std::minmax_element(x_data, x_data + x->data_size() - 1); + + cpp::OpDesc opdesc; + opdesc.SetType("lrn"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("alpha", alpha); + opdesc.SetAttr("beta", beta); + opdesc.SetAttr("k", k); + opdesc.SetAttr("n", local_size); + opdesc.SetAttr("norm_region", norm_region); + opdesc.SetAttr("input_scale", (*dmax - *dmin) / 255.f); + + auto op = CreateOp(opdesc, &scope); + + // baseline + lrn_compute_ref(op); + out_ref->CopyDataFrom(*out); + + Tensor input_x; + input_x.Resize(x->dims()); + transpose(x->mutable_data(), + input_x.mutable_data(), + {static_cast(x_dim[0]), + static_cast(x_dim[1]), + static_cast(x_dim[2]), + static_cast(x_dim[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_x); + + LaunchOp(op, {x_var_name}, {out_var_name}); + + Tensor output_trans; + auto os = out->dims(); + output_trans.Resize(os); + transpose(out->mutable_data(), + output_trans.mutable_data(), + {static_cast(os[0]), + static_cast(os[2]), + static_cast(os[3]), + static_cast(os[1])}, + {0, 3, 1, 2}); + + auto output_data = output_trans.mutable_data(); + auto* output_ref_data = out_ref->mutable_data(); + for (size_t i = 0; i < out->data_size(); i++) { + EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-4); + } +} + +TEST(MLUBridges, lrn) { + int local_size = 5; + float alpha = 0.0001f; + float beta = 0.75; + float k = 2.0f; + std::string norm_region = "AcrossChannels"; + for (int w : {2, 4, 8}) { + for (int h : {2, 4, 8}) { + for (int c : {1, 2, 3, 4}) { + for (int n : {1, 2, 3, 4}) { + test_lrn(alpha, beta, k, local_size, n, c, h, w, norm_region); + } + } + } + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(lrn, kMLU) diff --git a/lite/kernels/mlu/bridges/norm_op.cc b/lite/kernels/mlu/bridges/norm_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..492c3932a8c8a68f7eba687dde30d888d6e0f297 --- /dev/null +++ b/lite/kernels/mlu/bridges/norm_op.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int NormConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Get input vars and op attributes + auto x_var_name = op_info->Input("X").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto x_dims = x->dims().Vectorize(); + + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + int axis = op_info->GetAttr("axis"); + int epsilon = op_info->GetAttr("epsilon"); + if (axis < 0) { + axis = axis + x_dims.size(); + } + std::vector nchw2nhwc = {0, 3, 1, 2}; + int nhwc_axis = nchw2nhwc[axis]; + + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + + // ======== DEBUG =============== + VLOG(6) << "x name=" << x_var_name; + VLOG(6) << "out name=" << out_var_name; + VLOG(6) << "x dims=" << x->dims(); + VLOG(6) << "out dims=" << output->dims(); + VLOG(6) << "axis =" << axis; + VLOG(6) << "nwhc axis=" << nhwc_axis; + VLOG(6) << "epsilon =" << epsilon; + // cnmlPrintTensor(input_tensor->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(output_tensor->mlu_tensor(), CNML_TENSOR); + // ======== DEBUG END ============ + cnmlBaseOp_t norm_op{nullptr}; + + cnmlNormalizeOpParam_t param; + int mode = -1; + switch (axis) { + case 0: + mode = 3; // N + break; + case 1: + mode = 0; // C + break; + case 2: + mode = 4; // H + break; + case 3: + mode = 5; // W + break; + default: + CHECK(0); + break; + } + cnmlCreateNormalizeOpParamV2(¶m, + 0, // p + 0, // use_scale + mode, + 1, // weight + epsilon); + + CNML_CALL(cnmlCreateNormalizeOp(&norm_op, + param, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + nullptr, + false /*is_fix8_mode*/)); + graph->FuseOp(norm_op); + CNML_CALL(cnmlDestroyBaseOp(&norm_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(norm, + kMLU, + paddle::lite::subgraph::mlu::NormConverter); diff --git a/lite/kernels/mlu/bridges/norm_op_test.cc b/lite/kernels/mlu/bridges/norm_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..35b5eabbb9ffacd96c3ca6500dd9181f4d5bec5b --- /dev/null +++ b/lite/kernels/mlu/bridges/norm_op_test.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/norm_op.h" + +#include + +#include +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +// void ToFile(std::string file_name, Tensor* tensor) { +// int count = tensor->dims().production(); +// auto data = tensor->mutable_data(); +// std::ostringstream outs; +// for (size_t i = 0; i < count; i++) { +// outs << data[i] << std::endl; +// } +// std::ofstream of; +// of.open(file_name, std::ios::out); +// of << outs.str(); +// of.close(); +// } + +void norm_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + auto out = + scope->FindVar(op_info->Output("Out").front())->GetMutable(); + int axis = op_info->GetAttr("axis"); + int epsilon = op_info->GetAttr("epsilon"); + auto x_dims = x->dims(); + if (axis < 0) { + axis += x_dims.size(); + } + out->Resize(x_dims.Vectorize()); + auto* out_data = out->mutable_data(); + + const auto* x_data = x->data(); + int pre_n = x_dims.count(0, axis); + int n = x_dims[axis]; + int post_n = x_dims.count(axis + 1, x_dims.size()); + for (int i = 0; i < pre_n; i++) { + for (int k = 0; k < post_n; k++) { + float sum = epsilon; + const float* in_tmp = x_data + i * n * post_n + k; + for (int j = 0; j < n; j++) { + sum += in_tmp[j * post_n] * in_tmp[j * post_n]; + } + sum = std::sqrt(sum); + float* out_tmp = out_data + i * n * post_n + k; + for (int j = 0; j < n; j++) { + out_tmp[j * post_n] = in_tmp[j * post_n] / sum; + } + } + } +} + +void test_norm(const std::vector& input_shape, int axis) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize(input_shape); + // initialize input&output data + FillTensor(x, -9, 9); + // initialize op desc + cpp::OpDesc opdesc; + float epsilon = 1e-9f; + opdesc.SetType("norm"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axis", static_cast(axis)); + opdesc.SetAttr("epsilon", static_cast(epsilon)); + + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + norm_ref(op); + out_ref->CopyDataFrom(*out); + Tensor input_x; + input_x.Resize(DDim(input_shape)); + // change input layout from NCHW to NHWC + transpose(x->mutable_data(), + input_x.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_x); + + LaunchOp(op, {x_var_name}, {out_var_name}); + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + std::vector out_shape = input_shape; + Tensor output_trans; + output_trans.Resize(out_shape); + // Change output layout from NHWC to NCHW + transpose(out_data, + output_trans.mutable_data(), + {static_cast(out_shape[0]), + static_cast(out_shape[2]), + static_cast(out_shape[3]), + static_cast(out_shape[1])}, + {0, 3, 1, 2}); + out_data = output_trans.mutable_data(); + + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], 
out_ref_data[i], 1e-2); + } +} + +TEST(MLUBridges, norm) { + test_norm({1, 2, 3, 4}, 1); + test_norm({1, 2, 3, 4}, 2); + test_norm({1, 2, 3, 4}, 3); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(norm, kMLU); diff --git a/lite/kernels/mlu/bridges/paddle_use_bridges.h b/lite/kernels/mlu/bridges/paddle_use_bridges.h index d31ba0dd41111860a3b26d8ac3afb3273bef4557..be5c64b3b7056d0b8de1589d198db541b5a3777b 100644 --- a/lite/kernels/mlu/bridges/paddle_use_bridges.h +++ b/lite/kernels/mlu/bridges/paddle_use_bridges.h @@ -15,6 +15,7 @@ #pragma once USE_SUBGRAPH_BRIDGE(relu, kMLU); +USE_SUBGRAPH_BRIDGE(relu6, kMLU) USE_SUBGRAPH_BRIDGE(conv2d, kMLU); USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kMLU); USE_SUBGRAPH_BRIDGE(elementwise_add, kMLU); @@ -24,5 +25,26 @@ USE_SUBGRAPH_BRIDGE(batch_norm, kMLU); USE_SUBGRAPH_BRIDGE(fc, kMLU); USE_SUBGRAPH_BRIDGE(nearest_interp, kMLU); USE_SUBGRAPH_BRIDGE(leaky_relu, kMLU); +USE_SUBGRAPH_BRIDGE(transpose, kMLU); +USE_SUBGRAPH_BRIDGE(transpose2, kMLU); USE_SUBGRAPH_BRIDGE(concat, kMLU); USE_SUBGRAPH_BRIDGE(scale, kMLU); +USE_SUBGRAPH_BRIDGE(sigmoid, kMLU); +USE_SUBGRAPH_BRIDGE(elementwise_mul, kMLU); +USE_SUBGRAPH_BRIDGE(dropout, kMLU); +USE_SUBGRAPH_BRIDGE(arg_max, kMLU); +USE_SUBGRAPH_BRIDGE(split, kMLU); +USE_SUBGRAPH_BRIDGE(cast, kMLU); +USE_SUBGRAPH_BRIDGE(layout, kMLU); +USE_SUBGRAPH_BRIDGE(slice, kMLU); +USE_SUBGRAPH_BRIDGE(squeeze, kMLU); +USE_SUBGRAPH_BRIDGE(squeeze2, kMLU); +USE_SUBGRAPH_BRIDGE(flatten, kMLU); +USE_SUBGRAPH_BRIDGE(flatten2, kMLU); +USE_SUBGRAPH_BRIDGE(reshape, kMLU); +USE_SUBGRAPH_BRIDGE(reshape2, kMLU); +#ifdef LITE_BUILD_EXTRA +USE_SUBGRAPH_BRIDGE(gather, kMLU); +USE_SUBGRAPH_BRIDGE(lrn, kMLU) +USE_SUBGRAPH_BRIDGE(norm, kMLU) +#endif diff --git a/lite/kernels/mlu/bridges/pool_op.cc b/lite/kernels/mlu/bridges/pool_op.cc index f77c8084c76fc52c39938e723f02bde9b3cac41b..c734de1eec75d253a9b6b8d7a7f21d710df3d949 100644 --- a/lite/kernels/mlu/bridges/pool_op.cc +++ b/lite/kernels/mlu/bridges/pool_op.cc @@ -55,6 +55,9 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto global_pooling = op_info->GetAttr("global_pooling"); auto ksize = op_info->GetAttr>("ksize"); auto strides = op_info->GetAttr>("strides"); + CHECK(!(op_info->HasAttr("exclusive") && + op_info->GetAttr("exclusive") == false)) + << "Unsupport param exclusive is false!"; if (paddings.size() == 2L) { for (size_t i = 0; i < 2L; ++i) { @@ -62,8 +65,6 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); } } - int pad_height = paddings[0]; - int pad_width = paddings[2]; std::string padding_algorithm(""); if (op_info->HasAttr("padding_algorithm")) { padding_algorithm = op_info->GetAttr("padding_algorithm"); @@ -72,6 +73,8 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { if (op_info->HasAttr("adaptive")) { adaptive = op_info->GetAttr("adaptive"); } + auto input_dims = x->dims(); + lite::operators::UpdatePadding(&paddings, global_pooling, adaptive, @@ -80,31 +83,31 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { strides, ksize); - // std::vector output_shape({input_dims[0], input_dims[1]}); - // for (size_t i = 0; i < 2; i++) { - // output_shape.push_back( - // (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - - // ksize[0]) / - // strides[i] + - // 1); - // } + if (global_pooling) { + ksize.resize(static_cast(input_dims.size()) - 2); + for (size_t i = 0; i < ksize.size(); ++i) { + ksize[i] = 
static_cast(input_dims[i + 2]); + } + } auto output_tensor = graph->AddNode( output_var_name, output_shape, CNML_TENSOR, CNML_NCHW, graph->FPType()); cnmlPoolOpParam_t pool_param; CNML_CALL( - cnmlCreatePoolOpParam_V2(&pool_param, + cnmlCreatePoolOpParam_V3(&pool_param, ksize[0], ksize[1], strides[0], strides[1], - pad_height, - pad_width, - 1, // dilation - 1, + paddings[0], + paddings[1], + paddings[2], + paddings[3], + 1, // dilation h + 1, // dilation w ToCnmlPoolMode(pooling_type), - ceil_mode ? CNML_POOL_KVALID : CNML_POOL_KFULL, + ceil_mode ? CNML_POOL_KFULL : CNML_POOL_KVALID, true, /* real */ 1 /* blend factor */)); cnmlBaseOp_t pool_op; @@ -114,6 +117,7 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { output_tensor->mlu_tensor())); CNML_CALL(cnmlDestroyPoolOpParam(&pool_param)); graph->FuseOp(pool_op); + CNML_CALL(cnmlDestroyBaseOp(&pool_op)); return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/pool_op_test.cc b/lite/kernels/mlu/bridges/pool_op_test.cc index 8cee8dbe86109b14cff49f329d71074a9b3bfb61..2ae888744fde3e94e857f04d50ceb1eb878f3c1c 100644 --- a/lite/kernels/mlu/bridges/pool_op_test.cc +++ b/lite/kernels/mlu/bridges/pool_op_test.cc @@ -43,6 +43,12 @@ void pool_ref(const std::shared_ptr op) { std::string pooling_type = op_info->GetAttr("pooling_type"); bool global_pooling = op_info->GetAttr("global_pooling"); + if (pooling_type == "max") { + for (int i = 0; i < out_dims.production(); ++i) { + dst_ptr[i] = -65504.f; + } + } + int in_n = in_dims[0]; int in_c = in_dims[1]; int in_h = in_dims[2]; @@ -203,62 +209,46 @@ void test_pool(int bs, } TEST(MLUBridges, pool) { - // for (auto pooling_type : {"max", "avg"}) { - // for (auto ceil_mode : {true, false}) { - // for (auto global_pooling : {/*true, */ false}) { - // for (auto exclusive : {true /*, false*/}) { - // for (auto ksize : {2, 3}) { - // for (auto stride : {1, 2}) { - // for (auto padding : {0, 1}) { - // for (auto bs : {1, 3}) { - // for (auto ic : {1, 3}) { - // for (auto ih : {3, 7}) { - // for (auto iw : {3, 7}) { - // test_pool(bs, - // ic, - // ih, - // iw, - // pooling_type, - // ceil_mode, - // global_pooling, - // exclusive, - // ksize, - // stride, - // padding); - // } - // } - // } - // } - // } - // } - // } - // } - // } - // } - // } - for (auto pooling_type : {"max", "avg"}) { for (auto ceil_mode : {true, false}) { - bool global_pooling = false; - bool exclusive = true; - int ksize = 2; - int stride = 1; - int padding = 0; - int bs = 6; - int ic = 6; - int ih = 6; - int iw = 6; - test_pool(bs, - ic, - ih, - iw, - pooling_type, - ceil_mode, - global_pooling, - exclusive, - ksize, - stride, - padding); + for (auto global_pooling : {true, false}) { + for (auto exclusive : {true /*, false*/}) { + for (auto ksize : {2, 3}) { + for (auto stride : {1, 2}) { + for (auto padding : {0, 1}) { + for (auto bs : {1, 3}) { + for (auto ic : {1, 3}) { + for (auto ih : {3, 7}) { + for (auto iw : {3, 7}) { + LOG(INFO) + << "shape: " << bs << ',' << ic << ',' << ih << ',' + << iw << '\t' << "pooling type: " << pooling_type + << '\t' << "ceil model: " << ceil_mode << '\t' + << "global_pooling: " << global_pooling << '\t' + << "exclusive: " << exclusive << '\t' + << "ksize: " << ksize << '\t' + << "stride: " << stride << '\t' + << "padding: " << padding; + test_pool(bs, + ic, + ih, + iw, + pooling_type, + ceil_mode, + global_pooling, + exclusive, + ksize, + stride, + padding); + } + } + } + } + } + } + } + } + } } } } diff --git a/lite/kernels/mlu/bridges/reshape_op.cc 
b/lite/kernels/mlu/bridges/reshape_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b47322b3462525be64e42b608d052719d7c5f0b --- /dev/null +++ b/lite/kernels/mlu/bridges/reshape_op.cc @@ -0,0 +1,130 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + // ================== Trans1: NHWC => NCHW =========================== + auto input_tensor = graph->GetNode(x_var_name); + auto trans_1_axis = std::move(GetAxisNHWC2NCHW(x->dims().size())); + auto trans1_out = graph->AddNode(x_var_name + ".trans.i", + x->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + graph->FPType(), + CNML_NCHW); + cnmlBaseOp_t trans1_op{nullptr}; + cnmlNdTransposeOpParam_t trans1_param{nullptr}; + CNML_CALL(cnmlCreateNdTransposeOpParam( + &trans1_param, trans_1_axis.data(), trans_1_axis.size())); + CNML_CALL(cnmlCreateNdTransposeProOp(&trans1_op, + input_tensor->mlu_tensor(), + trans1_out->mlu_tensor(), + trans1_param)); + // ======================== Trans1 End ================================== + + // ======================= Reshape op =================================== + cnmlBaseOp_t reshape_op; + auto trans2_input = graph->AddNode(out_var_name + ".trans.o", + output_dims, + CNML_TENSOR, + CNML_NCHW, + graph->FPType(), + CNML_NCHW); + cnmlReshapeOpParam_t reshape_param{nullptr}; + int cnml_trans2_input_shape[4]; + CNML_CALL( + cnmlGetTensorShape(trans2_input->mlu_tensor(), cnml_trans2_input_shape)); + CNML_CALL( + cnmlCreateNdReshapeOpParam(&reshape_param, cnml_trans2_input_shape, 4)); + + // Use cnmlCreatexxxOpForward to create op. 
+ CNML_CALL(cnmlCreateReshapeOp(&reshape_op, + reshape_param, + trans1_out->mlu_tensor(), + trans2_input->mlu_tensor())); + // ======================= Reshape op End =================================== + + // ================== Trans2: NCHW => NHWC =============================== + auto trans_2_axis = std::move(GetAxisNCHW2NHWC(output->dims().size())); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + cnmlBaseOp_t trans2_op{nullptr}; + cnmlNdTransposeOpParam_t trans2_param{nullptr}; + CNML_CALL(cnmlCreateNdTransposeOpParam( + &trans2_param, trans_2_axis.data(), trans_2_axis.size())); + CNML_CALL(cnmlCreateNdTransposeProOp(&trans2_op, + trans2_input->mlu_tensor(), + output_tensor->mlu_tensor(), + trans2_param)); + // ======================== Trans2 End ================================== + + // =============== DEBUG ==================== + VLOG(6) << "x_var_name: " << x_var_name; + VLOG(6) << "out_var_name: " << out_var_name; + VLOG(6) << "input dim: " << x->dims(); + VLOG(6) << "output dim: " << output->dims(); + int cnml_input_shape[4]; + CNML_CALL(cnmlGetTensorShape(input_tensor->mlu_tensor(), cnml_input_shape)); + VLOG(6) << "cnml input dim: "; + for (size_t i = 0; i < 4; i++) { + VLOG(6) << cnml_input_shape[i]; + } + // cnmlPrintTensor(input_tensor->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(trans1_out->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(trans2_input->mlu_tensor(), CNML_TENSOR); + // cnmlPrintTensor(output_tensor->mlu_tensor(), CNML_TENSOR); + // =============== DEBUG END ================= + + graph->FuseOp(trans1_op); + graph->FuseOp(reshape_op); + graph->FuseOp(trans2_op); + CNML_CALL(cnmlDestroyBaseOp(&trans1_op)); + CNML_CALL(cnmlDestroyBaseOp(&reshape_op)); + CNML_CALL(cnmlDestroyBaseOp(&trans2_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(reshape, + kMLU, + paddle::lite::subgraph::mlu::ReshapeConverter); +REGISTER_SUBGRAPH_BRIDGE(reshape2, + kMLU, + paddle::lite::subgraph::mlu::ReshapeConverter); diff --git a/lite/kernels/mlu/bridges/reshape_op_test.cc b/lite/kernels/mlu/bridges/reshape_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0cd2c6cc26f8f40ee83c99755d8842b072693b1a --- /dev/null +++ b/lite/kernels/mlu/bridges/reshape_op_test.cc @@ -0,0 +1,98 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/reshape_op.h" + +#include + +#include + +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +void test_reshape(std::vector input_shape, + std::vector out_shape) { + // prepare input&output variables + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + x->Resize(input_shape); + Tensor x_cpu; + + // initialize input&output data + FillTensor(x); + x_cpu.CopyDataFrom(*x); + + Tensor input_trans; + input_trans.Resize(input_shape); + transpose(x->mutable_data(), + input_trans.mutable_data(), + {static_cast(input_shape[0]), + static_cast(input_shape[1]), + static_cast(input_shape[2]), + static_cast(input_shape[3])}, + {0, 2, 3, 1}); + x->CopyDataFrom(input_trans); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("reshape2"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + std::vector shape_attr; + shape_attr.resize(out_shape.size()); + for (size_t i = 0; i < out_shape.size(); i++) { + shape_attr[i] = static_cast(out_shape[i]); + } + + opdesc.SetAttr>("shape", shape_attr); + auto op = CreateOp(opdesc, &scope); + + auto os = out->dims(); + out->Resize(out_shape); + LaunchOp(op, {x_var_name}, {out_var_name}); + + Tensor out_trans; + out_trans.Resize(out_shape); + transpose(out->mutable_data(), + out_trans.mutable_data(), + {static_cast(out_shape[0]), + static_cast(out_shape[1]), + static_cast(out_shape[2]), + static_cast(out_shape[3])}, + {0, 3, 1, 2}); + out->CopyDataFrom(out_trans); + // compare results + auto* out_data = out->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], x_cpu.mutable_data()[i], 1e-5); + } +} + +TEST(MLUBridges, reshape) { test_reshape({1, 2, 4, 4}, {1, 4, 2, 4}); } +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(reshape, kMLU); +USE_SUBGRAPH_BRIDGE(reshape2, kMLU); diff --git a/lite/kernels/mlu/bridges/scale_op.cc b/lite/kernels/mlu/bridges/scale_op.cc index 5557602bd7576ccd71c51f52a538a45fe27f7ada..5b6b3dff7969562b19344f9eccbf219d26c3e02d 100644 --- a/lite/kernels/mlu/bridges/scale_op.cc +++ b/lite/kernels/mlu/bridges/scale_op.cc @@ -61,6 +61,7 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { alpha_tensor->mlu_tensor(), beta_tensor->mlu_tensor())); graph->FuseOp(scale_op); + CNML_CALL(cnmlDestroyBaseOp(&scale_op)); return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/slice_op.cc b/lite/kernels/mlu/bridges/slice_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..067d110bf4160c5bcf2bbd3009d82bbb5804c998 --- /dev/null +++ b/lite/kernels/mlu/bridges/slice_op.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int SliceConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto scope = op->scope(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // input + auto input_var_name = op_info->Input("Input").front(); + auto input = scope->FindVar(input_var_name)->GetMutable(); + auto input_shape = input->dims().Vectorize(); + // output + auto output_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(output_var_name)->GetMutable(); + // attr + auto axes = op_info->GetAttr>("axes"); + auto starts = op_info->GetAttr>("starts"); + auto ends = op_info->GetAttr>("ends"); + + CHECK(graph->HasNode(input_var_name)); + auto input_tensor = graph->GetNode(input_var_name); + auto output_tensor = graph->AddNode(output_var_name, + output->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + graph->FPType()); + + std::vector begin_index(input_shape.size(), 0); + std::vector end_index(input_shape.size()); + std::vector strides(input_shape.size(), 1); + auto nhwc2nchw_axis = std::move(GetAxisNHWC2NCHW(input_shape.size())); + for (size_t i = 0; i < input_shape.size(); ++i) { + end_index[nhwc2nchw_axis[i]] = input_shape[i]; + } + for (size_t i = 0; i < axes.size(); i++) { + int dim_value = input_shape[axes[i]]; + int end = ends[i] < 0 ? std::max(ends[i] + dim_value, 0) : ends[i]; + begin_index[nhwc2nchw_axis[axes[i]]] = + starts[i] < 0 ? std::max(starts[i] + dim_value, 0) : starts[i]; + end_index[nhwc2nchw_axis[axes[i]]] = std::min(end, dim_value); + } + + cnmlNdStridedSliceOpParam_t param; + cnmlBaseOp_t slice_op; + CNML_CALL(cnmlCreateNdStridedSliceOpParam(¶m, + input_shape.size(), + begin_index.data(), + end_index.data(), + strides.data())); + CNML_CALL(cnmlCreateNdStridedSliceOp(&slice_op, + param, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + CNML_CALL(cnmlDestroyNdStridedSliceOpParam(¶m)); + + graph->FuseOp(slice_op); + CNML_CALL(cnmlDestroyBaseOp(&slice_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(slice, + kMLU, + paddle::lite::subgraph::mlu::SliceConverter); diff --git a/lite/kernels/mlu/bridges/slice_op_test.cc b/lite/kernels/mlu/bridges/slice_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a5e2a9f5a4c99b6f46fff24686cdbe546cae727d --- /dev/null +++ b/lite/kernels/mlu/bridges/slice_op_test.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/slice_op.h" +#include +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +static void slice_ref(const float* input, + std::vector in_dims, + std::vector axes, + std::vector starts, + std::vector ends, + float* out) { + auto out_dims = in_dims; + std::vector real_starts(in_dims.size(), 0); + std::vector real_ends(in_dims.size(), 0); + std::vector real_step(in_dims.size(), 0); + for (size_t i = 0; i < in_dims.size(); i++) { + real_ends[i] = in_dims[i]; + } + for (size_t i = 0; i < axes.size(); i++) { + int dim_value = in_dims[axes[i]]; + if (dim_value > 0) { + int start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; + int end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i]; + start = std::max(start, 0); + end = std::max(end, 0); + end = std::min(end, dim_value); + out_dims[axes[i]] = end - start; + real_starts[axes[i]] = start; + real_ends[axes[i]] = end; + } + } + const int LEN = in_dims.size(); + int dst_step[LEN]; + for (size_t i = 0; i < in_dims.size(); ++i) { + dst_step[i] = 1; + } + int src_step[LEN]; + for (size_t i = 0; i < in_dims.size(); ++i) { + src_step[i] = 1; + } + int out_num = out_dims[in_dims.size() - 1]; + for (int i = in_dims.size() - 2; i >= 0; i--) { + dst_step[i] = out_dims[i + 1] * dst_step[i + 1]; + src_step[i] = in_dims[i + 1] * src_step[i + 1]; + out_num *= out_dims[i]; + } + + for (int dst_id = 0; dst_id < out_num; dst_id++) { + int src_id = 0; + int index_id = dst_id; + for (size_t j = 0; j < out_dims.size(); j++) { + int cur_id = index_id / dst_step[j]; + index_id = index_id % dst_step[j]; + src_id += (cur_id + real_starts[j]) * src_step[j]; + } + out[dst_id] = input[src_id]; + } +} + +static void test_case(std::vector x_shape, + std::vector out_shape, + std::vector starts, + std::vector ends, + std::vector axes) { + Scope scope; + + std::string x_var_name = "x"; + std::string out_var_name = "out"; + auto* x = scope.NewTensor(x_var_name); + auto* out = scope.NewTensor(out_var_name); + x->Resize(lite::DDim(x_shape)); + out->Resize(lite::DDim(out_shape)); + + auto x_data = x->mutable_data(); + FillTensor(x, 0.f, 2.f); + + cpp::OpDesc opdesc; + opdesc.SetType("slice"); + opdesc.SetInput("Input", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("axes", axes); + opdesc.SetAttr("starts", starts); + opdesc.SetAttr("ends", ends); + + std::vector out_ref(out->data_size(), 0); + slice_ref(x_data, x_shape, axes, starts, ends, out_ref.data()); + + auto type_cast = [](int64_t in) { return static_cast(in); }; + std::vector i_dims; + std::transform( + x_shape.cbegin(), x_shape.cend(), std::back_inserter(i_dims), type_cast); + + auto nchw2nhwc_axis = std::move(GetAxisNCHW2NHWC(x_shape.size())); + + Tensor input_x; + input_x.Resize(x->dims()); + transpose(x->mutable_data(), + input_x.mutable_data(), + i_dims, + nchw2nhwc_axis); + x->CopyDataFrom(input_x); + + auto op = CreateOp(opdesc, &scope); + LaunchOp(op, {x_var_name}, {out_var_name}); + + Tensor output_trans; + auto os = out->dims().Vectorize(); + output_trans.Resize(os); + std::vector o_dims(os.size()); + for (size_t i = 0; i < os.size(); ++i) { + o_dims[i] = os[nchw2nhwc_axis[i]]; + } + transpose(out->mutable_data(), + output_trans.mutable_data(), + o_dims, + GetAxisNHWC2NCHW(x_shape.size())); + + auto out_data = output_trans.mutable_data(); + for 
(DDim::value_type i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_ref[i], out_data[i], 1e-4); + } +} + +TEST(MLUBridges, slice) { + /* test_case({3}, {3}, {-3}, {3}, {0}); */ + test_case({3, 4}, {3, 4}, {-3, 0}, {3, 100}, {0, 1}); + test_case({3, 4, 5}, {3, 4, 2}, {-3, 0, 2}, {3, 100, -1}, {0, 1, 2}); + test_case({3, 4, 5, 6}, {3, 4, 2, 6}, {-3, 0, 2}, {3, 100, -1}, {0, 1, 2}); + /* test_case({3, 4, 5, 6, 3}, {3, 4, 2, 6, 3}, {-3, 0, 2}, {3, 100, -1}, {0, + * 1, 2}); */ + /* test_case({3, 4, 5, 6, 5, 2}, {3, 4, 2, 6, 5, 2}, {-3, 0, 2}, {3, 100, 1}, + * {0, 1, 2}); */ +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(slice, kMLU); diff --git a/lite/kernels/mlu/bridges/softmax_op.cc b/lite/kernels/mlu/bridges/softmax_op.cc index 17c911675718a15c7ede4888b268ffcd62b4d8ed..b1b621c1efc6cbc54092a8082e4d624355e07652 100644 --- a/lite/kernels/mlu/bridges/softmax_op.cc +++ b/lite/kernels/mlu/bridges/softmax_op.cc @@ -35,9 +35,10 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto out_var_name = op_info->Output("Out").front(); auto output = scope->FindVar(out_var_name)->GetMutable(); auto output_dims = output->dims().Vectorize(); + auto x_shape = + scope->FindVar(x_var_name)->GetMutable()->dims().Vectorize(); - // nchw axis to nhwc aixs - int nchw_to_nhwc_aixs_map[4] = {0, 3, 1, 2}; + // nchw axis to nhwc axis int axis = 1; if (op_info->HasAttr("axis")) { axis = op_info->GetAttr("axis"); @@ -45,7 +46,9 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { axis = output_dims.size() + axis; } } - int nhwc_axis = nchw_to_nhwc_aixs_map[axis]; + // value of nhwc2nchw_axis is index of nhwc + // order of nhwc2nchw_axis is nchw + int nhwc_axis = GetAxisNHWC2NCHW(x_shape.size())[axis]; auto output_tensor = graph->AddNode( out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); @@ -55,6 +58,7 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { graph->GetNode(x_var_name)->mlu_tensor(), output_tensor->mlu_tensor())); graph->FuseOp(softmax_op); + CNML_CALL(cnmlDestroyBaseOp(&softmax_op)); return SUCCESS; } diff --git a/lite/kernels/mlu/bridges/softmax_op_test.cc b/lite/kernels/mlu/bridges/softmax_op_test.cc index a5251ed43c9187fc2874f9b01853b45b8abf7f1c..d5d7251205a0f60b9e5c8568a58ba48661c9df3e 100644 --- a/lite/kernels/mlu/bridges/softmax_op_test.cc +++ b/lite/kernels/mlu/bridges/softmax_op_test.cc @@ -93,7 +93,7 @@ void test_softmax(const std::vector& input_shape, int axis) { opdesc.SetOutput("Out", {out_var_name}); opdesc.SetAttr("axis", axis); - // create and convert op to NPU model, then run it on NPU + // create and convert op to MLU model, then run it on MLU auto op = CreateOp(opdesc, &scope); // execute reference implementation and save to output tensor softmax_ref(op); diff --git a/lite/kernels/mlu/bridges/split_op.cc b/lite/kernels/mlu/bridges/split_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..4188ba3ec08161552bc688c212408fa81ae815a3 --- /dev/null +++ b/lite/kernels/mlu/bridges/split_op.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/mlu/bridges/graph.h"
+#include "lite/kernels/mlu/bridges/utility.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+int SplitConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto scope = op->scope();
+  VLOG(3) << "[MLU] Converting " + op_type + "...";
+
+  auto x_var_name = op_info->Input("X").front();
+  auto x = scope->FindVar(x_var_name)->GetMutable<Tensor>();
+  auto x_dims = x->dims().Vectorize();
+
+  auto out_var_name = op_info->Output("Out");
+
+  auto param_axis = op_info->GetAttr<int>("axis");
+
+  auto num = op_info->GetAttr<int>("num");
+  auto sections = op_info->GetAttr<std::vector<int>>("sections");
+  int64_t sections_num = static_cast<int64_t>(sections.size());
+  auto output_num = num > 0 ? num : sections_num;
+
+  std::vector<cnmlTensor_t> output_tensor;
+  for (auto out_name : out_var_name) {
+    auto out = scope->FindVar(out_name)->GetMutable<Tensor>();
+    auto out_dims = out->dims().Vectorize();
+    auto out_tensor = graph->AddNode(
+        out_name, out_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
+    output_tensor.push_back(out_tensor->mlu_tensor());
+  }
+
+  auto dims = x_dims.size();
+  int axis = (param_axis < 0) ? (param_axis + dims) : param_axis;
+  CHECK_LE(axis, 4) << "Unsupported dims in MLU split";
+  int nhwc_axis = GetAxisNHWC2NCHW<int>(dims)[axis];
+
+  CHECK(graph->HasNode(x_var_name));
+  auto input_tensor = graph->GetNode(x_var_name);
+
+  cnmlBaseOp_t split_op;
+  cnmlTensor_t inputs = input_tensor->mlu_tensor();
+  CNML_CALL(cnmlCreateNdSplitOp(
+      &split_op, nhwc_axis, &inputs, 1, output_tensor.data(), output_num));
+  graph->FuseOp(split_op);
+  CNML_CALL(cnmlDestroyBaseOp(&split_op));
+  return SUCCESS;
+}
+
+}  // namespace mlu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(split,
+                         kMLU,
+                         paddle::lite::subgraph::mlu::SplitConverter);
diff --git a/lite/kernels/mlu/bridges/split_op_test.cc b/lite/kernels/mlu/bridges/split_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a44a45504036e9ef6199e9d2b534aa3dde63bb01
--- /dev/null
+++ b/lite/kernels/mlu/bridges/split_op_test.cc
@@ -0,0 +1,199 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "lite/operators/split_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +template +void split_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); + int num = op_info->GetAttr("num"); + int axis = op_info->GetAttr("axis"); + std::vector sections = op_info->GetAttr>("sections"); + std::vector output_vec; + auto output = op_info->Output("Out"); + for (auto out_var : output) { + output_vec.push_back(scope->Var(out_var)->GetMutable()); + } + auto in_dims = x->dims(); + auto rank = in_dims.size(); + int outs_number = output_vec.size(); + std::vector outs_dims; + outs_dims.reserve(outs_number); + if (axis < 0) { + axis += rank; + } + if (num > 0) { + int out_axis_dim = in_dims[axis] / num; + for (int i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = out_axis_dim; + outs_dims.push_back(dim); + } + } else if (sections.size() > 0) { + for (size_t i = 0; i < outs_number; ++i) { + auto dim = in_dims; + dim[axis] = sections[i]; + outs_dims.push_back(dim); + } + } + for (int j = 0; j < outs_dims.size(); ++j) { + output_vec[j]->Resize(outs_dims[j]); + } + + const dtype* din = x->mutable_data(); + std::vector in_strides(in_dims.size()); + in_strides[in_dims.size() - 1] = in_dims[in_dims.size() - 1]; + for (int i = in_dims.size() - 2; i >= 0; --i) { + in_strides[i] = in_strides[i + 1] * in_dims[i]; + } + + int input_offset = 0; + for (auto out : output_vec) { + auto out_dim = out->dims(); + std::vector out_strides(out_dim.size()); + out_strides[out_dim.size() - 1] = out_dim[out_dim.size() - 1]; + for (int i = out_dim.size() - 2; i >= 0; --i) { + out_strides[i] = out_strides[i + 1] * out_dim[i]; + } + + dtype* out_data = out->mutable_data(); + int before = out_strides[0] / out_strides[axis]; + int in_after = in_strides[axis]; + int out_after = out_strides[axis]; + + for (int i = 0; i < before; ++i) { + std::memcpy(out_data + i * out_after, + din + input_offset + i * in_after, + sizeof(dtype) * out_after); + } + input_offset += out_strides[axis]; + } +} + +void test_split(int bs, + int ic, + int ih, + int iw, + int axis, + int num, + std::vector sections) { + // prepare input&output variables + std::string x_var_name = "x"; + std::string out_var_name_1 = "out_1"; + std::string out_var_name_2 = "out_2"; + std::string out_ref_var_name_1 = "out_ref_1"; + std::string out_ref_var_name_2 = "out_ref_2"; + + Scope scope; + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out_1 = scope.Var(out_var_name_1)->GetMutable(); + auto* out_2 = scope.Var(out_var_name_2)->GetMutable(); + auto* out_ref_1 = scope.Var(out_ref_var_name_1)->GetMutable(); + auto* out_ref_2 = scope.Var(out_ref_var_name_2)->GetMutable(); + x->Resize({bs, ic, ih, iw}); + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("split"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name_1, out_var_name_2}); + opdesc.SetAttr("axis", axis); + opdesc.SetAttr("sections", sections); + opdesc.SetAttr("num", num); + + auto op = CreateOp(opdesc, &scope); + split_ref(op); + out_ref_1->CopyDataFrom(*out_1); + out_ref_2->CopyDataFrom(*out_2); + // execute reference 
implementation and save to output tensor + + Tensor input; + input.Resize({bs, ic, ih, iw}); + transpose(x->mutable_data(), + input.mutable_data(), + {static_cast(bs), + static_cast(ic), + static_cast(ih), + static_cast(iw)}, + {0, 2, 3, 1}); + x->CopyDataFrom(input); + LaunchOp(op, {x_var_name}, {out_var_name_1, out_var_name_2}); + + // compare results + auto* out_data_1 = out_1->mutable_data(); + auto* out_data_2 = out_2->mutable_data(); + auto* out_ref_data_1 = out_ref_1->mutable_data(); + auto* out_ref_data_2 = out_ref_2->mutable_data(); + + Tensor output1, output2; + output1.Resize(out_1->dims()); + output2.Resize(out_2->dims()); + transpose(out_data_1, + output1.mutable_data(), + {static_cast(out_1->dims()[0]), + static_cast(out_1->dims()[2]), + static_cast(out_1->dims()[3]), + static_cast(out_1->dims()[1])}, + {0, 3, 1, 2}); + transpose(out_data_2, + output2.mutable_data(), + {static_cast(out_2->dims()[0]), + static_cast(out_2->dims()[2]), + static_cast(out_2->dims()[3]), + static_cast(out_2->dims()[1])}, + {0, 3, 1, 2}); + out_data_1 = output1.mutable_data(); + out_data_2 = output2.mutable_data(); + for (int i = 0; i < out_1->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data_1[i], out_ref_data_1[i], 5e-4); + } + for (int i = 0; i < out_2->dims().production(); i++) { + VLOG(5) << i; + EXPECT_NEAR(out_data_2[i], out_ref_data_2[i], 5e-4); + } +} + +TEST(MLUBridges, split) { + test_split(4, 2, 3, 1, 0, 2, {}); + test_split(4, 2, 3, 1, 0, 0, {3, 1}); + test_split(4, 6, 3, 1, 1, 2, {}); + test_split(4, 6, 3, 1, 1, 0, {2, 4}); + test_split(4, 2, 2, 1, 2, 2, {}); + test_split(4, 2, 6, 1, 2, 0, {3, 3}); + test_split(4, 2, 3, 4, 3, 2, {}); + test_split(4, 2, 3, 6, 3, 0, {5, 1}); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(split, kMLU); diff --git a/lite/kernels/mlu/bridges/squeeze_op.cc b/lite/kernels/mlu/bridges/squeeze_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f8af5b014bdba29bb50036473f671ec359f26d4 --- /dev/null +++ b/lite/kernels/mlu/bridges/squeeze_op.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +int SqueezeConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Create act node and set params from op + auto fp_type = graph->FPType(); + auto x_var_name = op_info->Input("X").front(); + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, fp_type); + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + + auto output_dims_nhwc = DimNCHW2NHWC(output_dims); + std::vector o_dims(output_dims.size()); + std::transform(output_dims_nhwc.cbegin(), + output_dims_nhwc.cend(), + o_dims.begin(), + [](DDim::value_type d) { return static_cast(d); }); + + cnmlReshapeOpParam_t param; + cnmlBaseOp_t squeeze_op; + CNML_CALL(cnmlCreateNdReshapeOpParam(¶m, o_dims.data(), o_dims.size())); + CNML_CALL(cnmlCreateReshapeOp(&squeeze_op, + param, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor())); + CNML_CALL(cnmlDestroyReshapeOpParam(¶m)); + graph->FuseOp(squeeze_op); + CNML_CALL(cnmlDestroyBaseOp(&squeeze_op)); + + if (op_type == "squeeze2") { + auto xshape_var_name = op_info->Output("XShape").front(); + auto xshape = scope->FindVar(xshape_var_name)->GetMutable(); + auto dims_64 = xshape->dims().Vectorize(); + auto dims_64_nhwc = DimNCHW2NHWC(dims_64); + auto xshape_tensor = graph->AddNode( + xshape_var_name, dims_64, CNML_TENSOR, CNML_NCHW, fp_type); + + std::vector xshape_dims(dims_64.size()); + std::transform(dims_64_nhwc.cbegin(), + dims_64_nhwc.cend(), + xshape_dims.begin(), + [](DDim::value_type d) { return static_cast(d); }); + + cnmlBaseOp_t squeeze2_op; + CNML_CALL(cnmlCreateNdReshapeOpParam( + ¶m, xshape_dims.data(), xshape_dims.size())); + CNML_CALL(cnmlCreateReshapeOp(&squeeze2_op, + param, + input_tensor->mlu_tensor(), + xshape_tensor->mlu_tensor())); + CNML_CALL(cnmlDestroyReshapeOpParam(¶m)); + graph->FuseOp(squeeze2_op); + CNML_CALL(cnmlDestroyBaseOp(&squeeze2_op)); + } + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(squeeze, + kMLU, + paddle::lite::subgraph::mlu::SqueezeConverter); +REGISTER_SUBGRAPH_BRIDGE(squeeze2, + kMLU, + paddle::lite::subgraph::mlu::SqueezeConverter); diff --git a/lite/kernels/mlu/bridges/squeeze_op_test.cc b/lite/kernels/mlu/bridges/squeeze_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ad16dac2e978fa977acacf62ed6adca16365ed6d --- /dev/null +++ b/lite/kernels/mlu/bridges/squeeze_op_test.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/squeeze_op.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/mlu/bridges/test_helper.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +// squeeze +TEST(MLUBridges, squeeze) { + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + std::string ref_var_name("ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(ref_var_name)->GetMutable(); + std::vector x_shape({1, 3, 1, 5}); + x->Resize(x_shape); + out_ref->Resize(x_shape); + std::vector out_shape({3, 5}); + out->Resize(out_shape); + + FillTensor(x, 0, 10); + out_ref->CopyDataFrom(*x); + + // SqueezeCompute squeeze; + cpp::OpDesc opdesc; + opdesc.SetType("squeeze"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + std::vector axes{0, -2}; + opdesc.SetAttr("axes", axes); + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + LaunchOp(op, {x_var_name}, {out_var_name}); + + auto x_data = out_ref->data(); + auto out_data = out->data(); + for (int j = 0; j < out->numel(); ++j) { + EXPECT_NEAR(out_data[j], x_data[j], 1e-5); + } +} + +// squeeze2 +TEST(MLUBridges, squeeze2) { + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + std::string xshape_var_name("xshape"); + std::string ref_var_name("ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* xshape = scope.Var(xshape_var_name)->GetMutable(); + auto* out_ref = scope.Var(ref_var_name)->GetMutable(); + std::vector x_shape({1, 3, 1, 5}); + x->Resize(x_shape); + out_ref->Resize(x_shape); + std::vector out_shape({3, 5}); + out->Resize(out_shape); + std::vector xshape_shape({1, 3, 1, 5}); + xshape->Resize(xshape_shape); + + FillTensor(x, 0, 10); + out_ref->CopyDataFrom(*x); + + // Squeeze2Compute squeeze2; + cpp::OpDesc opdesc; + opdesc.SetType("squeeze2"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetOutput("XShape", {xshape_var_name}); + + std::vector axes({0, -2}); + opdesc.SetAttr("axes", axes); + // create and convert op to MLU model, then run it on MLU + auto op = CreateOp(opdesc, &scope); + LaunchOp(op, {x_var_name}, {out_var_name, xshape_var_name}); + + auto x_data = out_ref->mutable_data(); + auto out_data = out->mutable_data(); + auto xshape_data = xshape->mutable_data(); + for (int j = 0; j < out->numel(); ++j) { + EXPECT_NEAR(out_data[j], x_data[j], 1e-5); + EXPECT_NEAR(xshape_data[j], x_data[j], 1e-5); + } +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(squeeze, kMLU); +USE_SUBGRAPH_BRIDGE(squeeze2, kMLU); diff --git a/lite/kernels/mlu/bridges/tensor.cc b/lite/kernels/mlu/bridges/tensor.cc index be7e1f09beaee61dace598b958ab4f95f14b38f8..f1bf48d66e8693e72a96f0f52c285a717f464128 100644 --- 
a/lite/kernels/mlu/bridges/tensor.cc +++ b/lite/kernels/mlu/bridges/tensor.cc @@ -16,6 +16,9 @@ #include #include #include +#include +#include +#include #include namespace paddle { @@ -25,8 +28,9 @@ namespace mlu { MLUTensor::MLUTensor(const std::vector& shape, cnmlTensorType_t tensor_type, - cnmlDataOrder_t data_order, - cnmlDataType_t mlu_dtype) + cnmlDataOrder_t shape_order, + cnmlDataType_t mlu_dtype, + cnmlDataOrder_t data_order) : mlu_tensor_(nullptr), tensor_type_(tensor_type), mlu_ptr_(nullptr) { std::vector int_shape; for (auto i : shape) { @@ -36,15 +40,18 @@ MLUTensor::MLUTensor(const std::vector& shape, LOG(FATAL) << "Shape size is beyond the limitation of MLUTensor!"; } } - remember(int_shape, tensor_type, mlu_dtype, data_order); + remember(int_shape, tensor_type, mlu_dtype, shape_order, data_order); } void MLUTensor::remember(const std::vector& shape, cnmlTensorType_t tensor_type, cnmlDataType_t mlu_dtype, - cnmlDataOrder_t shape_order) { + cnmlDataOrder_t shape_order, + cnmlDataOrder_t data_order) { tensor_type_ = tensor_type; mlu_dtype_ = mlu_dtype; + data_order_ = data_order; + origin_shape_.assign(shape.begin(), shape.end()); int size = 4; if (shape.size() > 4 || shape_order == CNML_ARRAY) { @@ -239,13 +246,22 @@ void MLUTensor::remember(const std::vector& shape, break; } } - dim_ = shape_.size(); + auto shape_NCHW = DimNHWC2NCHW(shape_); + shape_NCHW.erase(shape_NCHW.begin() + shape.size(), shape_NCHW.end()); + dim_ = shape_NCHW.size(); + shape_ = DimNCHW2NHWC(shape_NCHW); } void MLUTensor::Create() { if (mlu_tensor_ == nullptr) { CNML_CALL(cnmlCreateTensor_V2(&mlu_tensor_, tensor_type_)); std::vector dim_shape(shape_); + if (data_order_ == CNML_NCHW) { + std::transform(origin_shape_.cbegin(), + origin_shape_.cend(), + dim_shape.begin(), + [](DDim::value_type in) { return static_cast(in); }); + } int* dim_strides = nullptr; CNML_CALL(cnmlSetTensorShape_V2( mlu_tensor_, dim_, dim_shape.data(), dim_strides)); @@ -258,6 +274,84 @@ cnmlTensor_t MLUTensor::mlu_tensor() { return mlu_tensor_; } +void MLUTensor::ToFile(std::string file_name) { + if (mlu_ptr_) { + VLOG(5) << "to dump mlu ptr: " << mlu_ptr_ << " to: " << file_name; + int count = 1; + for (size_t i = 0; i < shape_.size(); i++) { + count *= shape_[i]; + } + VLOG(6) << " dump count: " << count; + VLOG(6) << " dump shape: "; + for (size_t i = 0; i < shape_.size(); i++) { + VLOG(6) << shape_[i] << " "; + } + + std::vector cpu_data_fp32(count); + // fp16 to fp32 + if (mlu_dtype_ == CNML_DATA_FLOAT16) { + VLOG(6) << " convert fp16 to fp32 "; + std::vector cpu_data_fp16(count); + cnrtMemcpy(cpu_data_fp16.data(), + mlu_ptr_, + count * sizeof(uint16_t), + CNRT_MEM_TRANS_DIR_DEV2HOST); + for (int i = 0; i < count; i++) { + cnrtConvertHalfToFloat(&(cpu_data_fp32[i]), cpu_data_fp16[i]); + } + } else { + cnrtMemcpy(cpu_data_fp32.data(), + mlu_ptr_, + count * sizeof(float), + CNRT_MEM_TRANS_DIR_DEV2HOST); + } + + // trans to nchw + std::vector cpu_data_trans(count); + if (data_order_ != CNML_NCHW) { + switch (shape_.size()) { + case 4: + transpose(cpu_data_fp32.data(), + cpu_data_trans.data(), + shape_, + {0, 3, 1, 2}); + break; + case 3: + transpose( + cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0, 2, 1}); + break; + case 2: + transpose( + cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0, 1}); + break; + case 1: + transpose(cpu_data_fp32.data(), cpu_data_trans.data(), shape_, {0}); + break; + default: + CHECK(0) << "ToFile only support dim <=4"; + break; + } + } + + // to file + std::ostringstream outs; + for 
(int i = 0; i < count; i++) { + if (data_order_ == CNML_NCHW) { + outs << cpu_data_fp32[i] << std::endl; + } else { + outs << cpu_data_trans[i] << std::endl; + } + } + std::ofstream of; + of.open(file_name, std::ios::out); + of << outs.str(); + of.close(); + } else { + LOG(FATAL) << "mlu ptr is null ,can not dump mlu content to : " + << file_name; + } +} + MLUTensor::~MLUTensor() { if (mlu_tensor_ != nullptr) { CNML_CALL(cnmlDestroyTensor(&mlu_tensor_)); diff --git a/lite/kernels/mlu/bridges/tensor.h b/lite/kernels/mlu/bridges/tensor.h index 12dc97a772dabc529bf183f783a22a9f2dfa936d..22268f69ba39926dbbfb1bbb18e3a86331097f90 100644 --- a/lite/kernels/mlu/bridges/tensor.h +++ b/lite/kernels/mlu/bridges/tensor.h @@ -14,6 +14,8 @@ #pragma once +#include +#include #include #include "lite/kernels/mlu/bridges/utility.h" @@ -33,13 +35,15 @@ class MLUTensor { MLUTensor(const std::vector& shape, cnmlTensorType_t tensor_type = CNML_TENSOR, - cnmlDataOrder_t data_order = CNML_NCHW, - cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32); + cnmlDataOrder_t shape_order = CNML_NCHW, + cnmlDataType_t mlu_dtype = CNML_DATA_FLOAT32, + cnmlDataOrder_t data_order = CNML_NHWC); void remember(const std::vector& shape, cnmlTensorType_t tensor_type, cnmlDataType_t mlu_dtype, - cnmlDataOrder_t shape_order); + cnmlDataOrder_t shape_order, + cnmlDataOrder_t data_order); void Create(); cnmlTensor_t mlu_tensor(); void* mlu_data() { @@ -47,14 +51,21 @@ class MLUTensor { return mlu_ptr_; } + cnmlDataType_t dtype() { return mlu_dtype_; } void set_mlu_dtype(cnmlDataType_t type) { mlu_dtype_ = type; } + const std::vector& get_origin_shape() const { return origin_shape_; } + ~MLUTensor(); + void ToFile(std::string file_name); + cnmlDataOrder_t dorder() { return data_order_; } + private: cnmlTensor_t mlu_tensor_; std::vector shape_; + std::vector origin_shape_; cnmlTensorType_t tensor_type_; cnmlDataType_t mlu_dtype_; int dim_{0}; diff --git a/lite/kernels/mlu/bridges/test_helper.cc b/lite/kernels/mlu/bridges/test_helper.cc index 377a00689ef3a27f78ae008072578ab3701cd337..36eeb473f6a37aa28a9447280f808f5fb08978d0 100644 --- a/lite/kernels/mlu/bridges/test_helper.cc +++ b/lite/kernels/mlu/bridges/test_helper.cc @@ -24,18 +24,38 @@ namespace lite { namespace subgraph { namespace mlu { +template +void PrepareInput(Graph* graph, + const std::string& input_name, + Tensor* input_tensor, + cnmlDataOrder_t order) { + thread_local Tensor temp_input; + temp_input.Resize(input_tensor->dims().Vectorize()); + temp_input.CopyDataFrom(*input_tensor); + using data_type = typename MLUTypeTraits::type; + auto input_node = graph->AddNode( + input_name, + input_tensor->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + MLUTypeTraits::cnml_type, + order, + reinterpret_cast( + input_tensor->template mutable_data(TARGET(kMLU)))); + CHECK(input_node); + CNRT_CHECK(cnrtMemcpy(input_tensor->template mutable_data(), + temp_input.mutable_data(), + sizeof(data_type) * input_tensor->dims().production(), + CNRT_MEM_TRANS_DIR_HOST2DEV)); +} + void LaunchOp(const std::shared_ptr op, const std::vector& input_var_names, - const std::vector& output_var_names) { + const std::vector& output_var_names, + cnmlDataOrder_t order) { CNRT_CALL(cnrtInit(0)); - ::paddle::lite::SetMluDevice(0); + lite::SetMluDevice(0); cnrtQueue_t queue_; - cnrtInvokeFuncParam_t forward_param; - u32_t affinity = 1; - int data_param = 1; - forward_param.data_parallelism = &data_param; - forward_param.affinity = &affinity; - forward_param.end = CNRT_PARAM_END; CNRT_CALL(cnrtCreateQueue(&queue_)); 
cnrtDev_t dev_handle; CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, 0)); @@ -50,23 +70,21 @@ void LaunchOp(const std::shared_ptr op, // Convert input data var and add it into the MLU IR graph for (auto& input_name : input_var_names) { auto input_tensor = scope->FindMutableTensor(input_name); - CHECK(input_tensor); - Tensor temp_input; - temp_input.Resize(input_tensor->dims().Vectorize()); - temp_input.CopyDataFrom(*input_tensor); - auto input_node = - graph.AddNode(input_name, - input_tensor->dims().Vectorize(), - CNML_TENSOR, - CNML_NCHW, - graph.FPType(), - reinterpret_cast( - input_tensor->mutable_data(TARGET(kMLU)))); - CHECK(input_node); - CNRT_CHECK(cnrtMemcpy(input_tensor->mutable_data(), - temp_input.mutable_data(), - sizeof(float) * input_tensor->dims().production(), - CNRT_MEM_TRANS_DIR_HOST2DEV)); + auto data_type = input_tensor->precision(); + + switch (data_type) { +#define PREPARE_INPUT(type__) \ + case PRECISION(type__): \ + PrepareInput(&graph, input_name, input_tensor, order); \ + break; + PREPARE_INPUT(kFP16) + PREPARE_INPUT(kFloat) + PREPARE_INPUT(kInt8) + PREPARE_INPUT(kInt32) +#undef PREPARE_INPUT + default: + CHECK(0); + } } op->CheckShape(); op->InferShape(); @@ -89,8 +107,9 @@ void LaunchOp(const std::shared_ptr op, } graph.Compile(CNML_MLU270, 1); + graph.Compute(queue_, *(graph.MutableInputs()), *(graph.MutableOutputs())); + CNRT_CALL(cnrtSyncQueue(queue_)); - graph.Compute(forward_param, queue_); for (auto& output_name : output_var_names) { auto output_tensor = scope->FindMutableTensor(output_name); Tensor temp_out; diff --git a/lite/kernels/mlu/bridges/test_helper.h b/lite/kernels/mlu/bridges/test_helper.h index 4da9e72dfcc5a81a68467f7622e2c16aedb2ded5..36fe6f1efaed76deccdc6e9542bb52a2aefc2571 100644 --- a/lite/kernels/mlu/bridges/test_helper.h +++ b/lite/kernels/mlu/bridges/test_helper.h @@ -58,7 +58,8 @@ void FillTensor(Tensor* x, void LaunchOp(const std::shared_ptr op, const std::vector& input_var_names, - const std::vector& output_var_names); + const std::vector& output_var_names, + cnmlDataOrder_t order = CNML_NHWC); } // namespace mlu } // namespace subgraph diff --git a/lite/kernels/mlu/bridges/transpose_op.cc b/lite/kernels/mlu/bridges/transpose_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b6caeb3613fea8f348e3990ec2c9660321590116 --- /dev/null +++ b/lite/kernels/mlu/bridges/transpose_op.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace mlu { + +std::vector axis_to_nhwc(const std::vector& axis) { + std::vector new_axis(axis.size()); + + auto nhwc2nchw_axis = std::move(GetAxisNHWC2NCHW(axis.size())); + auto nchw2nhwc_axis = std::move(GetAxisNCHW2NHWC(axis.size())); + + for (size_t i = 0; i < new_axis.size(); ++i) { + new_axis[i] = nhwc2nchw_axis[axis[nchw2nhwc_axis[i]]]; + } + return new_axis; +} + +int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[MLU] Converting " + op_type + "..."; + + // Get input vars and op attributes + auto x_var_name = op_info->Input("X").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto x_dims = x->dims().Vectorize(); + + auto out_var_name = op_info->Output("Out").front(); + auto output = scope->FindVar(out_var_name)->GetMutable(); + auto output_dims = output->dims().Vectorize(); + + auto axis = op_info->GetAttr>("axis"); + std::vector axis_nhwc = axis_to_nhwc(axis); + + auto output_tensor = graph->AddNode( + out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType()); + + CHECK(graph->HasNode(x_var_name)); + auto input_tensor = graph->GetNode(x_var_name); + cnmlBaseOp_t transpose_op{nullptr}; + + cnmlNdTransposeOpParam_t transpose_param{nullptr}; + + CNML_CALL(cnmlCreateNdTransposeOpParam( + &transpose_param, axis_nhwc.data(), axis_nhwc.size())); + + // Use cnmlCreatexxxOpForward to create op. + CNML_CALL(cnmlCreateNdTransposeProOp(&transpose_op, + input_tensor->mlu_tensor(), + output_tensor->mlu_tensor(), + transpose_param)); + + graph->FuseOp(transpose_op); + CNML_CALL(cnmlDestroyBaseOp(&transpose_op)); + return SUCCESS; +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle +REGISTER_SUBGRAPH_BRIDGE(transpose, + kMLU, + paddle::lite::subgraph::mlu::TransposeConverter); +REGISTER_SUBGRAPH_BRIDGE(transpose2, + kMLU, + paddle::lite::subgraph::mlu::TransposeConverter); diff --git a/lite/kernels/mlu/bridges/transpose_op_test.cc b/lite/kernels/mlu/bridges/transpose_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6e8f7890581279f0ab4d51006c194967fd9c61e7 --- /dev/null +++ b/lite/kernels/mlu/bridges/transpose_op_test.cc @@ -0,0 +1,153 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include "lite/operators/transpose_op.h"
+#include <gtest/gtest.h>
+#include "lite/core/op_registry.h"
+#include "lite/kernels/mlu/bridges/test_helper.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace mlu {
+
+int data_index(std::vector<int> pos, DDimLite dims) {
+  int d1 = dims[1];
+  int d2 = dims[2];
+  int d3 = dims[3];
+  return pos[3] + pos[2] * d3 + pos[1] * d3 * d2 + pos[0] * d3 * d2 * d1;
+}
+
+std::vector<int> pos_trans(std::vector<int> in_pos, std::vector<int> axis) {
+  std::vector<int> out_pos(in_pos.size());
+  for (size_t i = 0; i < axis.size(); i++) {
+    out_pos[axis[i]] = in_pos[i];
+  }
+  return out_pos;
+}
+
+template <typename dtype>
+void transpose_ref(const std::shared_ptr<operators::TransposeOp> op) {
+  Scope* scope = op->scope();
+  const OpInfo* op_info = op->op_info();
+
+  auto input =
+      scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
+  auto output =
+      scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
+  auto x_dims = input->dims();
+  auto y_dims = output->dims();
+  auto axis = op_info->GetAttr<std::vector<int>>("axis");
+
+  // auto input_data = input->data();
+  auto* input_data = input->mutable_data<dtype>();
+  auto* output_data = output->mutable_data<dtype>();
+
+  int input_n = x_dims[0];
+  int input_c = x_dims[1];
+  int input_h = x_dims[2];
+  int input_w = x_dims[3];
+
+  for (int n = 0; n < input_n; ++n) {
+    for (int c = 0; c < input_c; ++c) {
+      for (int h = 0; h < input_h; ++h) {
+        for (int w = 0; w < input_w; ++w) {
+          std::vector<int> in_pos{n, c, h, w};
+          std::vector<int> out_pos = pos_trans(in_pos, axis);
+          int in_index = data_index(in_pos, x_dims);
+          int out_index = data_index(out_pos, y_dims);
+          output_data[out_index] = input_data[in_index];
+        }
+      }
+    }
+  }
+}
+
+void test_transpose(const std::vector<int64_t>& input_shape,
+                    std::vector<int> axis) {
+  // prepare input&output variables
+  Scope scope;
+  std::string x_var_name = "x";
+  std::string out_var_name = "out";
+  std::string out_ref_var_name = "out_ref";
+  auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
+  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
+  auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
+  x->Resize(input_shape);
+
+  // initialize input&output data
+  FillTensor(x);
+
+  // initialize op desc
+  cpp::OpDesc opdesc;
+  opdesc.SetType("transpose");
+  opdesc.SetInput("X", {x_var_name});
+  opdesc.SetOutput("Out", {out_var_name});
+  opdesc.SetAttr("axis", axis);
+
+  // create and convert op to MLU model, then run it on MLU
+  auto op = CreateOp<operators::TransposeOp>(opdesc, &scope);
+
+  // transpose_ref must run before LaunchOp;
+  // otherwise it fails with "Cannot access memory".
+  // execute reference implementation and save to output tensor
+  transpose_ref<float>(op);
+  out_ref->CopyDataFrom(*out);
+
+  Tensor input_x;
+  input_x.Resize(DDim(input_shape));
+  transpose(x->mutable_data<float>(),
+            input_x.mutable_data<float>(),
+            {static_cast<int>(input_shape[0]),
+             static_cast<int>(input_shape[1]),
+             static_cast<int>(input_shape[2]),
+             static_cast<int>(input_shape[3])},
+            {0, 2, 3, 1});
+  x->CopyDataFrom(input_x);
+
+  LaunchOp(op, {x_var_name}, {out_var_name});
+  // compare results
+  auto* out_data = out->mutable_data<float>();
+  auto* out_ref_data = out_ref->mutable_data<float>();
+
+  Tensor output_trans;
+  output_trans.Resize(out->dims());
+  auto os = out->dims();
+  transpose(out_data,
+            output_trans.mutable_data<float>(),
+            {static_cast<int>(os[0]),
+             static_cast<int>(os[2]),
+             static_cast<int>(os[3]),
+             static_cast<int>(os[1])},
+            {0, 3, 1, 2});
+  out_data = output_trans.mutable_data<float>();
+  for (int i = 0; i < out->dims().production(); i++) {
+    EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2);
+  }
+}
+
+// TODO(pmshst): fix the transpose test
+TEST(MLUBridges, transpose) { + std::vector input_shape = {2, 3, 4, 5}; + test_transpose(input_shape, std::vector{0, 1, 3, 2}); +} + +} // namespace mlu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +USE_SUBGRAPH_BRIDGE(transpose, kMLU); +USE_SUBGRAPH_BRIDGE(transpose2, kMLU); diff --git a/lite/kernels/mlu/bridges/utility.cc b/lite/kernels/mlu/bridges/utility.cc index cd78553a652433fc41334a6bff5575031f5125e0..b53debd643ae2b1080644d2844d702797addabec 100644 --- a/lite/kernels/mlu/bridges/utility.cc +++ b/lite/kernels/mlu/bridges/utility.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "lite/kernels/mlu/bridges/utility.h" + #include namespace paddle { @@ -20,33 +21,21 @@ namespace lite { namespace subgraph { namespace mlu { -void transpose(float* input_data, - float* output_data, - std::vector input_shape, - std::vector axis) { +void transpose2d(float* input_data, + float* output_data, + std::vector input_shape) { + CHECK_EQ(input_shape.size(), 2); int old_index = -1; int new_index = -1; - int dim[4] = {0}; - std::vector shape = input_shape; - for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) { - for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) { - for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) { - for (dim[3] = 0; dim[3] < input_shape[3]; dim[3]++) { - old_index = dim[0] * shape[1] * shape[2] * shape[3] + - dim[1] * shape[2] * shape[3] + dim[2] * shape[3] + dim[3]; - new_index = - dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] + - dim[axis[1]] * shape[axis[2]] * shape[axis[3]] + - dim[axis[2]] * shape[axis[3]] + dim[axis[3]]; - output_data[new_index] = input_data[old_index]; - } - } + for (int i = 0; i < input_shape[0]; i++) { + for (int j = 0; j < input_shape[1]; j++) { + old_index = i * input_shape[1] + j; + new_index = j * input_shape[0] + i; + output_data[new_index] = input_data[old_index]; } } } -int scale2position(float scale) { return static_cast(-std::log2(scale)); } - void dequant(float* dst, int8_t* src, size_t size, float scale) { for (size_t i = 0; i < size; ++i) { dst[i] = static_cast(src[i]) * scale; diff --git a/lite/kernels/mlu/bridges/utility.h b/lite/kernels/mlu/bridges/utility.h index fa8fb1597c0fb068a855928dd20057d48ecd5eaf..fd1e5eb265936f11f258d86e2b6a91af1d55c6ed 100644 --- a/lite/kernels/mlu/bridges/utility.h +++ b/lite/kernels/mlu/bridges/utility.h @@ -16,24 +16,76 @@ #include #include + #include #include #include + #include "lite/backends/mlu/mlu_utils.h" #include "lite/core/op_lite.h" #include "lite/core/tensor.h" -#include "lite/fluid/data_type.h" +#include "lite/fluid/float16.h" namespace paddle { namespace lite { namespace subgraph { namespace mlu { -void transpose(float* input_data, - float* output_data, +void transpose2d(float* input_data, + float* output_data, + std::vector input_shape); + +template +void transpose(dtype* input_data, + dtype* output_data, std::vector input_shape, - std::vector axis); -int scale2position(float scale); + std::vector axis) { + int old_index = -1; + int new_index = -1; + std::vector shape; + std::vector expand_axis; + if (input_shape.size() < 5u) { + for (size_t i = 0; i < 5 - input_shape.size(); i++) { + shape.push_back(1); + expand_axis.push_back(i); + } + for (size_t i = 0; i < input_shape.size(); i++) { + shape.push_back(input_shape[i]); + expand_axis.push_back(axis[i] + 5 - input_shape.size()); + } + } else { + shape = input_shape; + expand_axis = axis; + } + int dim[5] = {0}; + for (dim[0] = 0; dim[0] < shape[0]; dim[0]++) { + for (dim[1] = 0; dim[1] < 
shape[1]; dim[1]++) { + for (dim[2] = 0; dim[2] < shape[2]; dim[2]++) { + for (dim[3] = 0; dim[3] < shape[3]; dim[3]++) { + for (dim[4] = 0; dim[4] < shape[4]; dim[4]++) { + old_index = dim[0] * shape[1] * shape[2] * shape[3] * shape[4] + + dim[1] * shape[2] * shape[3] * shape[4] + + dim[2] * shape[3] * shape[4] + dim[3] * shape[4] + + dim[4]; + new_index = dim[expand_axis[0]] * shape[expand_axis[1]] * + shape[expand_axis[2]] * shape[expand_axis[3]] * + shape[expand_axis[4]] + + dim[expand_axis[1]] * shape[expand_axis[2]] * + shape[expand_axis[3]] * shape[expand_axis[4]] + + dim[expand_axis[2]] * shape[expand_axis[3]] * + shape[expand_axis[4]] + + dim[expand_axis[3]] * shape[expand_axis[4]] + + dim[expand_axis[4]]; + output_data[new_index] = input_data[old_index]; + } + } + } + } + } +} + +inline int scale2position(float scale) { return std::floor(-std::log2(scale)); } + void dequant(float* dst, int8_t* src, size_t size, float scale); void dequant(float* dst, @@ -64,27 +116,94 @@ inline const ::paddle::lite::DDimLite DimNCHW2NHWC( std::vector({dim[0], dim[2], dim[3], dim[1]})); } -inline const std::vector DimNHWC2NCHW( - const std::vector& dim) { - return std::vector({dim[0], dim[3], dim[1], dim[2]}); +template +inline const std::vector DimNHWC2NCHW( + const std::vector& dim) { + switch (dim.size()) { + case 1: + return dim; + case 2: + return dim; + case 3: + return std::vector({dim[0], dim[2], dim[1]}); + case 4: + return std::vector({dim[0], dim[3], dim[1], dim[2]}); + case 5: + return std::vector({dim[0], dim[4], dim[1], dim[2], dim[3]}); + default: + CHECK(0) << "unsupport dimension"; + } +} + +template +inline const std::vector DimNCHW2NHWC( + const std::vector& dim) { + switch (dim.size()) { + case 1: + return dim; + case 2: + return dim; + case 3: + return std::vector({dim[0], dim[2], dim[1]}); + case 4: + return std::vector({dim[0], dim[2], dim[3], dim[1]}); + case 5: + return std::vector({dim[0], dim[2], dim[3], dim[4], dim[1]}); + default: + CHECK(0) << "unsupport dimension"; + } } -inline const std::vector DimNCHW2NHWC( - const std::vector& dim) { - return std::vector({dim[0], dim[2], dim[3], dim[1]}); +template +inline std::vector GetAxisNHWC2NCHW(size_t n_dims) { + std::vector nhwc2nchw_axis(n_dims); + nhwc2nchw_axis[0] = 0; + if (n_dims > 1) nhwc2nchw_axis[1] = n_dims - 1; + for (size_t i = 2; i < n_dims; ++i) { + nhwc2nchw_axis[i] = i - 1; + } + return nhwc2nchw_axis; +} + +template +inline std::vector GetAxisNCHW2NHWC(size_t n_dims) { + std::vector nchw2nhwc_axis(n_dims); + nchw2nhwc_axis[0] = 0; + for (size_t i = 1; i < n_dims - 1; ++i) { + nchw2nhwc_axis[i] = i + 1; + } + if (n_dims > 1) nchw2nhwc_axis[n_dims - 1] = 1; + return nchw2nhwc_axis; } template -struct FPTypeTraits {}; +struct MLUTypeTraits { + /* using type = void; */ + /* static constexpr cnmlDataType_t cnml_type = CNML_DATA_INVALID; */ +}; + +template <> +struct MLUTypeTraits { + using type = float; + static constexpr cnmlDataType_t cnml_type = CNML_DATA_FLOAT32; +}; + +template <> +struct MLUTypeTraits { + using type = paddle::lite::fluid::float16; + static constexpr cnmlDataType_t cnml_type = CNML_DATA_FLOAT16; +}; template <> -struct FPTypeTraits { - typedef float T; +struct MLUTypeTraits { + using type = int8_t; + static constexpr cnmlDataType_t cnml_type = CNML_DATA_INT8; }; template <> -struct FPTypeTraits { - typedef paddle::lite::fluid::float16 T; +struct MLUTypeTraits { + using type = int32_t; + static constexpr cnmlDataType_t cnml_type = CNML_DATA_INT32; }; } // namespace mlu diff --git 
a/lite/kernels/mlu/io_copy_compute.cc b/lite/kernels/mlu/io_copy_compute.cc index 02e4d8b28e81e88201b895a4b8fbe9e93d3f17f9..ff8a7ddf6e4c465f288ba42b5b2537294a9d7ffd 100644 --- a/lite/kernels/mlu/io_copy_compute.cc +++ b/lite/kernels/mlu/io_copy_compute.cc @@ -41,6 +41,9 @@ class IoCopyHostToMluCompute auto mem_size = param.x->memory_size(); // LOG(INFO) << "copy size " << mem_size; auto* data = param.y->mutable_data(TARGET(kMLU), mem_size); + VLOG(6) << "io_copy host to mlu] memory size: " << mem_size + << " precision type: " << PrecisionToStr(Precision); + param.y->set_precision(param.x->precision()); CopyFromHostSync(data, param.x->raw_data(), mem_size); } @@ -79,6 +82,13 @@ class IoCopyMluToHostCompute CHECK(param.x->target() == TARGET(kMLU)); auto mem_size = param.x->memory_size(); auto* data = param.y->mutable_data(TARGET(kHost), mem_size); + VLOG(6) << "io_copy mlu to host] memory size: " << mem_size + << " precision type: " << PrecisionToStr(Precision); + + // sync queue to ensure process done + auto& mlu_context = this->ctx_->template As(); + CNRT_CALL(cnrtSyncQueue(mlu_context.exec_queue())); + CopyToHostSync(data, param.x->raw_data(), mem_size); } @@ -97,8 +107,14 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::IoCopyHostToMluCompute, host_to_device_kFloat) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))}) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kFloat), + DATALAYOUT(kAny))}) .Finalize(); REGISTER_LITE_KERNEL( @@ -108,8 +124,31 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::IoCopyHostToMluCompute, host_to_device_kFP16) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kMLU))}) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFP16), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kFP16), + DATALAYOUT(kAny))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + io_copy, + kMLU, + kInt32, + kNHWC, + paddle::lite::kernels::mlu::IoCopyHostToMluCompute, + host_to_device_kInt32) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt32), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kInt32), + DATALAYOUT(kAny))}) .Finalize(); REGISTER_LITE_KERNEL( @@ -119,8 +158,14 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::IoCopyMluToHostCompute, device_to_host_kFloat) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kFloat), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFloat), + DATALAYOUT(kAny))}) .Finalize(); REGISTER_LITE_KERNEL( @@ -130,6 +175,29 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::IoCopyMluToHostCompute, device_to_host_kFP16) - .BindInput("Input", {LiteType::GetTensorTy(TARGET(kMLU))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kFP16), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kFP16), + DATALAYOUT(kAny))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + io_copy, + kMLU, + kInt8, + kNHWC, + 
paddle::lite::kernels::mlu::IoCopyHostToMluCompute, + host_to_device_to_kInt8) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt8), + DATALAYOUT(kAny))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kInt8), + DATALAYOUT(kAny))}) .Finalize(); diff --git a/lite/kernels/mlu/layout_compute.cc b/lite/kernels/mlu/layout_compute.cc index d4e16734d6d2dae6f5c119194008bce114a2e918..42b12740ff0edb88ea2944e25ca03ade36caa956 100644 --- a/lite/kernels/mlu/layout_compute.cc +++ b/lite/kernels/mlu/layout_compute.cc @@ -24,9 +24,9 @@ namespace mlu {} // namespace mlu REGISTER_LITE_KERNEL( layout, - kMLU, + kX86, kFloat, - kNHWC, + kNCHW, paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute, def_layout_nhwc2nchw_fp32) .BindInput("Input", @@ -41,9 +41,9 @@ REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL( layout, - kMLU, + kX86, kFP16, - kNHWC, + kNCHW, paddle::lite::kernels::mlu::LayoutNhwcToNchwCompute, def_layout_nhwc2nchw_fp16) .BindInput("Input", @@ -58,9 +58,9 @@ REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL( layout, - kMLU, + kX86, kFloat, - kNHWC, + kNCHW, paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute, def_layout_nchw2nhwc_fp32) .BindInput("Input", @@ -75,9 +75,9 @@ REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL( layout, - kMLU, + kX86, kFP16, - kNHWC, + kNCHW, paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute, def_layout_nchw2nhwc_fp16) .BindInput("Input", @@ -92,11 +92,11 @@ REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL( layout, - kMLU, + kX86, kInt8, - kNHWC, + kNCHW, paddle::lite::kernels::mlu::LayoutNchwToNhwcCompute, - def_layout_nchw2nhwc_fp32_int8) + def_layout_nchw2nhwc_int8) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt8), diff --git a/lite/kernels/mlu/layout_compute.h b/lite/kernels/mlu/layout_compute.h index edacdf8a98a2ffde6e538f61d4dd8259e3211b22..df254865994fe8548df0e021ecb471f5a1020080 100644 --- a/lite/kernels/mlu/layout_compute.h +++ b/lite/kernels/mlu/layout_compute.h @@ -22,6 +22,7 @@ #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" #include "lite/core/type_system.h" +#include "lite/kernels/mlu/bridges/utility.h" #include "lite/operators/layout_op.h" namespace paddle { @@ -29,24 +30,6 @@ namespace lite { namespace kernels { namespace mlu { -template -struct FPTypeTraits {}; - -template <> -struct FPTypeTraits { - typedef float T; -}; - -template <> -struct FPTypeTraits { - typedef paddle::lite::fluid::float16 T; -}; - -template <> -struct FPTypeTraits { - typedef int8_t T; -}; - template inline void LayoutTransCompute(const int dim, const lite::Context& context, @@ -73,7 +56,7 @@ inline void LayoutTransCompute(const int dim, template class LayoutNchwToNhwcCompute - : public KernelLite { + : public KernelLite { public: using param_t = operators::LayoutParam; @@ -81,36 +64,37 @@ class LayoutNchwToNhwcCompute auto& param = this->template Param(); auto* x = param.x; auto* out = param.y; - out->template mutable_data::T>(); - auto x_dims = param.x->dims().size(); + out->template mutable_data< + typename subgraph::mlu::MLUTypeTraits::type>(); + auto x_ndims = param.x->dims().size(); auto& context = this->ctx_->template As(); const auto origin_dims = out->dims().Vectorize(); std::vector axis; - switch (x_dims) { + switch (x_ndims) { case 2: axis = {0, 1}; break; case 3: axis = {0, 2, 1}; out->Resize(std::vector{ - out->dims()[0], out->dims()[2], out->dims()[1]}); + origin_dims[0], origin_dims[2], origin_dims[1]}); break; case 4: axis = {0, 2, 3, 1}; out->Resize(std::vector{ - out->dims()[0], 
out->dims()[2], out->dims()[3], out->dims()[1]}); + origin_dims[0], origin_dims[2], origin_dims[3], origin_dims[1]}); break; default: CHECK(0) << "Unsupport dim in mlu layout nchw to nhwc"; } LayoutTransCompute::T>( - x_dims, context, *x, out, axis); + typename subgraph::mlu::MLUTypeTraits::type>( + x_ndims, context, *x, out, axis); - if (x_dims > 2) { + if (x_ndims > 2) { out->Resize(origin_dims); } } @@ -122,7 +106,7 @@ class LayoutNchwToNhwcCompute template class LayoutNhwcToNchwCompute - : public KernelLite { + : public KernelLite { public: using param_t = operators::LayoutParam; @@ -130,25 +114,27 @@ class LayoutNhwcToNchwCompute auto& param = this->template Param(); auto* x = param.x; auto* out = param.y; - out->template mutable_data::T>(); - auto x_dims = param.x->dims().size(); + out->template mutable_data< + typename subgraph::mlu::MLUTypeTraits::type>(); auto& context = this->ctx_->template As(); - const auto origin_dims = out->dims().Vectorize(); + TensorLite tmp_t; + tmp_t.ShareDataWith(*x); + const auto x_dims = x->dims().Vectorize(); + auto x_ndims = param.x->dims().size(); std::vector axis; - switch (x_dims) { + switch (x_ndims) { case 2: axis = {0, 1}; break; case 3: - out->Resize(std::vector{ - out->dims()[0], out->dims()[2], out->dims()[1]}); + tmp_t.Resize(std::vector{x_dims[0], x_dims[2], x_dims[1]}); axis = {0, 2, 1}; break; case 4: - out->Resize(std::vector{ - out->dims()[0], out->dims()[3], out->dims()[1], out->dims()[2]}); + tmp_t.Resize( + std::vector{x_dims[0], x_dims[2], x_dims[3], x_dims[1]}); axis = {0, 3, 1, 2}; break; default: @@ -156,12 +142,8 @@ class LayoutNhwcToNchwCompute } LayoutTransCompute::T>( - x_dims, context, *x, out, axis); - - if (x_dims > 2) { - out->Resize(origin_dims); - } + typename subgraph::mlu::MLUTypeTraits::type>( + x_ndims, context, tmp_t, out, axis); } std::string doc() const override { diff --git a/lite/kernels/mlu/subgraph_compute.cc b/lite/kernels/mlu/subgraph_compute.cc index 73ca9dcc20a6311d33e5cff6c6ed6be08f3c7a1f..450031021d3ad70c6abb348a6e498d8876f5ec56 100644 --- a/lite/kernels/mlu/subgraph_compute.cc +++ b/lite/kernels/mlu/subgraph_compute.cc @@ -36,8 +36,14 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::SubgraphCompute, def_kFloat) - .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kMLU))}) - .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kMLU))}) + .BindInput("Inputs", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .BindOutput("Outputs", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kAny), + DATALAYOUT(kAny))}) .Finalize(); REGISTER_LITE_KERNEL( @@ -47,6 +53,12 @@ REGISTER_LITE_KERNEL( kNHWC, paddle::lite::kernels::mlu::SubgraphCompute, def_FP16) - .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kMLU))}) - .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kMLU))}) + .BindInput("Inputs", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kAny), + DATALAYOUT(kAny))}) + .BindOutput("Outputs", + {LiteType::GetTensorTy(TARGET(kMLU), + PRECISION(kAny), + DATALAYOUT(kAny))}) .Finalize(); diff --git a/lite/kernels/mlu/subgraph_compute.h b/lite/kernels/mlu/subgraph_compute.h index 3bfba33f4d7e8fd86f7aaf276da2ca4a8b0bd7cf..a76f57ad90bb7f7ea2b2629c80da68cba4c7fffa 100644 --- a/lite/kernels/mlu/subgraph_compute.h +++ b/lite/kernels/mlu/subgraph_compute.h @@ -14,17 +14,24 @@ #pragma once +#include +#include #include #include #include + #include "lite/api/paddle_place.h" #include "lite/core/kernel.h" +#include "lite/core/op_lite.h" #include "lite/core/op_registry.h" 
+#include "lite/core/tensor.h" #include "lite/core/type_system.h" #include "lite/core/types.h" #include "lite/kernels/mlu/bridges/graph.h" +#include "lite/kernels/mlu/bridges/tensor.h" #include "lite/kernels/npu/bridges/engine.h" #include "lite/kernels/npu/bridges/registry.h" +#include "lite/utils/env.h" namespace paddle { namespace lite { @@ -40,56 +47,115 @@ class SubgraphEngine : public subgraph::Engine { const std::vector& input_names, const std::vector& output_names, Scope* scope, - ::paddle::lite_api::PrecisionType type) + paddle::lite_api::PrecisionType type) : subgraph::Engine( - ctx, block_idx, block_desc, input_names, output_names, scope) { - graph_.SetFPType(type); + ctx, block_idx, block_desc, input_names, output_names, scope), + fp_type_(type) { + VLOG(4) << "[MLU] PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL is " + << GetBoolFromEnv("PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL"); + VLOG(4) << "[MLU] PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE is " + << GetBoolFromEnv("PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE"); + VLOG(4) << "[MLU] LITE_DISABLE_MLU_CAST is " + << GetBoolFromEnv("LITE_DISABLE_MLU_CAST"); + if (GetBoolFromEnv("PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE")) { + disable_batch_size_changeable_ = true; + } } - int Build() { - // In order to attach all of the ops of the block desc, we need to build - // the original program firstly. - BuildOriginProgram(); - // Run InferShape() of all of ops, and convert Paddle ops to MLU IR graph - build_device_program_status_ = BuildDeviceProgram(); - return build_device_program_status_; + bool InputShapeChanged() { + std::vector> new_shape; + // used in batch changable situation + std::vector> all_shape; + for (auto origin_itensor : origin_itensors_) { + if (!disable_batch_size_changeable_) { + auto iv = origin_itensor->dims().Vectorize(); + all_shape.push_back(iv); + iv.erase(iv.begin()); + new_shape.push_back(iv); + } else { + new_shape.push_back(origin_itensor->dims().Vectorize()); + } + } + inputs_shape_ = new_shape; + all_inputs_shape_ = all_shape; + if (shape_graph_map_.count(inputs_shape_) > 0) { + return false; + } + VLOG(3) << "MLU graph input shape changed" << std::endl; + return true; } - int Launch() { - // Rebuild device program when the shapes of input tensors have been - // changed. 
- if (subgraph::CHECK_SUCCESS(build_device_program_status_) && - subgraph::CHECK_REBUILD_WHEN_SHAPE_CHANGED( - build_device_program_status_) && - InputShapeChanged()) { - Build(); - } - if (subgraph::CHECK_FAILED(build_device_program_status_)) { - LaunchOriginProgram(); - } else { - LaunchDeviceProgram(); + inline cnmlDataType_t PrecisionToDatatype(PrecisionType data_type) { + switch (data_type) { + case paddle::lite_api::PrecisionType::kFP16: + return CNML_DATA_FLOAT16; + case paddle::lite_api::PrecisionType::kFloat: + return CNML_DATA_FLOAT32; + case paddle::lite_api::PrecisionType::kInt32: + return CNML_DATA_INT32; + case paddle::lite_api::PrecisionType::kInt8: + return CNML_DATA_UINT8; + default: + return PrecisionToDatatype(fp_type_); } - return 0; } protected: - int BuildDeviceProgram() override { + bool BuildDeviceProgram() override { + if (origin_program_.empty()) { + BuildOriginProgram(); + } + if (!error_compile_batch_size_changeable_ && + !disable_batch_size_changeable_) { + int status = BuildDeviceProgramImpl(); + if (subgraph::CHECK_SUCCESS(status)) { + return status; + } + LOG(INFO) << "[MLU] build batch_size changeable subgraph op failed, " + "changed to input_shape changeable"; + } + error_compile_batch_size_changeable_ = true; + disable_batch_size_changeable_ = true; + return BuildDeviceProgramImpl(); + } + + bool BuildDeviceProgramImpl() { int status = 0; + auto graph = std::make_shared(); + graph->SetFPType(fp_type_); + std::vector> new_shape; + origin_itensors_.clear(); + origin_otensors_.clear(); + + auto data_order = block_desc_->GetOp(0)->Type() == "layout" + ? CNML_NCHW + : CNML_NHWC; // Convert all of input data vars and added into the MLU IR graph + status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED; for (auto& input_name : input_names_) { auto input_tensor = scope_->FindMutableTensor(input_name); + auto data_type = input_tensor->precision(); + cnmlDataType_t fp_type = PrecisionToDatatype(data_type); + origin_itensors_.push_back(input_tensor); + if (!disable_batch_size_changeable_) { + auto iv = input_tensor->dims().Vectorize(); + iv.erase(iv.begin()); + new_shape.push_back(iv); + } else { + new_shape.push_back(input_tensor->dims().Vectorize()); + } + CHECK(input_tensor); - auto input_node = - graph_.AddNode(input_name, - input_tensor->dims().Vectorize(), - CNML_TENSOR, - CNML_NCHW, - graph_.FPType(), - const_cast(input_tensor->raw_data())); + VLOG(4) << "subgraph input tensor " << input_name << std::endl; + auto input_node = graph->AddNode(input_name, + input_tensor->dims().Vectorize(), + CNML_TENSOR, + CNML_NCHW, + fp_type, + data_order); CHECK(input_node); // MLU doesn't support dynamic dimensions/shapes, so need to rebuild // the program when the shape of any input tensor is changed. 
- status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED; } LOG(INFO) << "START TO CONVERT "; // Convert all of ops and its weights and added into the MLU IR graph @@ -98,63 +164,304 @@ class SubgraphEngine : public subgraph::Engine { auto op = inst.op(); CHECK(op); std::string op_type = op->op_info()->Type(); + // since cnml's compile api will not return error now, we simply check + // op's type + if (!disable_batch_size_changeable_ && + std::find(unsupport_batch_size_changeable_op_type_.begin(), + unsupport_batch_size_changeable_op_type_.end(), + op_type) != + unsupport_batch_size_changeable_op_type_.end()) { + status |= subgraph::FAILED; + VLOG(4) << "[MLU] found unsupported batch_size changeable op type: " + << op_type; + if (subgraph::CHECK_FAILED(status)) { + return false; + } + return true; + } op->CheckShape(); const_cast(op)->InferShape(); if (!bridges.Exists(op_type, TARGET(kMLU))) { LOG(INFO) << "MLU bridges doesn't support op_type: " << op_type; - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); status |= bridges.Select(op_type, TARGET(kMLU))( - reinterpret_cast(&graph_), + reinterpret_cast(graph.get()), const_cast(op), const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } // Obtain the output nodes of the MLU IR graph and build the graph to MLU // runtime - std::vector valid_output_names; for (auto& output_name : output_names_) { - if (graph_.HasNode(output_name)) { - graph_.AddOutput(graph_.GetNode(output_name)); + if (graph->HasNode(output_name)) { + graph->AddOutput(graph->GetNode(output_name)); auto output_tensor = scope_->FindMutableTensor(output_name); - void* p_data = static_cast( - output_tensor->mutable_data::T>( - TARGET(kMLU))); - auto node = graph_.GetNode(output_name); - CHECK(p_data); - node->set_mlu_ptr(p_data); - valid_output_names.push_back(output_name); + origin_otensors_.push_back(output_tensor); + VLOG(4) << "subgraph output tensor " << output_name << std::endl; + + // auto node = graph->GetNode(output_name); + // CHECK(p_data); + // node->set_mlu_ptr(p_data); } } for (auto& input_name : input_names_) { - graph_.AddInput(graph_.GetNode(input_name)); + graph->AddInput(graph->GetNode(input_name), + disable_batch_size_changeable_); } - CHECK(!valid_output_names.empty()) << "[MLU] no valid output names"; + + CHECK(!origin_otensors_.empty()) << "[MLU] no valid output names"; auto& mlu_context = this->ctx_->template As(); auto core_version = mlu_context.MLUCoreVersion(); auto core_number = mlu_context.MLUCoreNumber(); - graph_.Compile(core_version, core_number); - return status; + graph->Compile(core_version, core_number); + shape_graph_map_[new_shape] = graph; + if (GetBoolFromEnv("PADDLE_LITE_MLU_SAVE_OFFLINE_MODEL")) { + graph->GenOfflineModel(GetOfflineModName()); + } + return true; + } + + std::string TrimStrings(const std::string& origin_str) { + std::string str = origin_str; + std::size_t found = str.find("0x"); + std::size_t found_end = 0; + const std::vector del_strs = { + "/trans_io_copy", "/trans_cast", "/trans_layout"}; + for (const auto& iterm : del_strs) { + found_end = str.find(iterm); + // trim point address and one of the del_strs + if (found != std::string::npos && found_end != std::string::npos) { + str.replace(found, found_end - found, ""); + found_end = str.find(iterm); + str.replace(found_end, iterm.size(), ""); + break; + } + } + return str; + } + + std::string GetOfflineModName() { + sort(input_names_.begin(), input_names_.end()); + sort(output_names_.begin(), 
output_names_.end()); + const auto& delimiter = "__"; + const auto& delimiter_num = "_"; + const auto& input_shape_str = "input_shape_"; + const auto& output_shape_str = "output_shape_"; + std::string name = ""; + std::string tmp = ""; + for (const auto& input_name : input_names_) { + tmp = input_name; + name += TrimStrings(tmp) + delimiter + input_shape_str; + auto input_tensor = scope_->FindMutableTensor(input_name); + for (const auto& iterm : input_tensor->dims().Vectorize()) { + name += std::to_string(iterm) + delimiter_num; + } + name += delimiter; + } + for (const auto& output_name : output_names_) { + tmp = output_name; + name += TrimStrings(tmp) + delimiter + output_shape_str; + auto output_tensor = scope_->FindMutableTensor(output_name); + for (const auto& iterm : output_tensor->dims().Vectorize()) { + name += std::to_string(iterm) + delimiter_num; + } + name += delimiter; + } + std::replace(name.begin(), name.end(), '/', '-'); + return name; + } + + void InferOutputsShapeOnly() { + // infer outputs shape when enable BATCH_SIZE_CHANGEABLE + const auto iter = in_out_shape_map_.find(all_inputs_shape_); + if (iter != in_out_shape_map_.end()) { + for (size_t i = 0; i < origin_otensors_.size(); ++i) { + origin_otensors_[i]->Resize(iter->second[i]); + } + } else { + for (auto& inst : origin_program_) { + auto op = inst.op(); + CHECK(op); + op->CheckShape(); + const_cast(op)->InferShape(); + } + std::vector> outs_shape; + for (size_t i = 0; i < origin_otensors_.size(); ++i) { + outs_shape.push_back(origin_otensors_[i]->dims().Vectorize()); + } + in_out_shape_map_[all_inputs_shape_] = outs_shape; + } } - int LaunchDeviceProgram() override { + inline void* GetOutputDataPtr(Tensor* tensor, bool use_mlu_cast) { + if (use_mlu_cast) { + // output is float, since cast fused in subgraph + return static_cast(tensor->mutable_data(TARGET(kMLU))); + } else { + return static_cast( + tensor->template mutable_data< + typename subgraph::mlu::MLUTypeTraits::type>( + TARGET(kMLU))); + } + } + + bool LaunchDeviceProgram() override { + // prepare input and output memory auto& mlu_context = this->ctx_->template As(); auto exec_queue = mlu_context.exec_queue(); - u32_t affinity = mlu_context.affinity(); - cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param(); - int data_param = 1; - forward_param.data_parallelism = &data_param; - forward_param.affinity = &affinity; - forward_param.end = CNRT_PARAM_END; - graph_.Compute(forward_param, exec_queue); - return 0; + + auto graph = shape_graph_map_[inputs_shape_]; + auto* graph_input = graph->MutableInputs(); + auto* graph_output = graph->MutableOutputs(); + CHECK_EQ(graph_input->size(), origin_itensors_.size()); + CHECK_EQ(graph_output->size(), origin_otensors_.size()); + + bool disable_mlu_cast = GetBoolFromEnv("LITE_DISABLE_MLU_CAST"); + + if (!disable_batch_size_changeable_) { + std::vector> + graph_in; + if (shape_tensor_map_in_.find(all_inputs_shape_) != + shape_tensor_map_in_.end()) { + graph_in = shape_tensor_map_in_[all_inputs_shape_]; + for (size_t i = 0; i < origin_itensors_.size(); ++i) { + graph_in[i]->set_mlu_ptr( + const_cast(origin_itensors_[i]->raw_data())); + } + } else { + graph_in.reserve(origin_itensors_.size()); + for (size_t i = 0; i < origin_itensors_.size(); ++i) { + paddle::lite::subgraph::mlu::MLUTensor tmp( + origin_itensors_[i]->dims().Vectorize()); + tmp.set_mlu_dtype(graph_input->at(i)->dtype()); + tmp.set_mlu_ptr(const_cast(origin_itensors_[i]->raw_data())); + graph_in.push_back( + std::make_shared(tmp)); + } + 
shape_tensor_map_in_[all_inputs_shape_] = graph_in; + } + + // TODO(zhangmingwei): we just call every op's infer_shape to get outputs' + // shape, may be it's better to use cnml's api to get output shape. This + // can be done when cnml's tensor dimension is totally equal to lite's + // tensor + // shape. + InferOutputsShapeOnly(); + // const std::vector> new_output_size = + // graph->InferOutputsShape(graph_in); + + std::vector> + graph_out; + + if (shape_tensor_map_out_.find(all_inputs_shape_) != + shape_tensor_map_out_.end()) { + graph_out = shape_tensor_map_out_[all_inputs_shape_]; + for (size_t i = 0; i < origin_otensors_.size(); ++i) { + // origin_otensors_[i]->Resize(new_output_size.at(i)); + graph_out[i]->set_mlu_ptr( + GetOutputDataPtr(origin_otensors_[i], !disable_mlu_cast)); + } + } else { + graph_out.reserve(origin_otensors_.size()); + for (size_t i = 0; i < origin_otensors_.size(); ++i) { + // origin_otensors_[i]->Resize(new_output_size.at(i)); + paddle::lite::subgraph::mlu::MLUTensor tmp( + origin_otensors_[i]->dims().Vectorize()); + tmp.set_mlu_dtype(graph_output->at(i)->dtype()); + tmp.set_mlu_ptr( + GetOutputDataPtr(origin_otensors_[i], !disable_mlu_cast)); + graph_out.push_back( + std::make_shared(tmp)); + } + shape_tensor_map_out_[all_inputs_shape_] = graph_out; + } + graph->Compute(exec_queue, graph_in, graph_out); + } else { + for (size_t i = 0; i < origin_itensors_.size(); ++i) { + graph_input->at(i)->set_mlu_ptr( + const_cast(origin_itensors_[i]->raw_data())); + } + for (size_t i = 0; i < origin_otensors_.size(); ++i) { + origin_otensors_[i]->Resize(graph_output->at(i)->get_origin_shape()); + graph_output->at(i)->set_mlu_ptr( + GetOutputDataPtr(origin_otensors_[i], !disable_mlu_cast)); + } + // only cnmlComputeFusionOpForward_V3 need cnrtInvokeFuncParam_t + cnrtInvokeFuncParam_t forward_param = mlu_context.forward_param(); + int data_param = 1; + forward_param.data_parallelism = &data_param; + u32_t affinity = mlu_context.affinity(); + forward_param.affinity = &affinity; + forward_param.end = CNRT_PARAM_END; + graph->Compute(forward_param, exec_queue); + +#ifdef MLU_DUMP_SUBGRAPH_IO + // Graph node store compile-time tensor while batchsize mutable is set. 
+ // Only batchsize mutable is disabled, data exists in graph node at + // runtime + // =========== DUMP =================== + for (auto input_name : input_names_) { + auto input_tensor = + shape_graph_map_[inputs_shape_]->GetNode(input_name); + auto dump_name = input_name; + while (dump_name.find("/") != std::string::npos) { + dump_name = dump_name.replace(dump_name.find("/"), 1, "_"); + } + VLOG(6) << "dump_name: " << dump_name; + input_tensor->ToFile(dump_name); + } + for (auto output_name : output_names_) { + if (shape_graph_map_[inputs_shape_]->HasNode(output_name)) { + auto output_tensor = + shape_graph_map_[inputs_shape_]->GetNode(output_name); + auto dump_name = output_name; + while (dump_name.find("/") != std::string::npos) { + dump_name = dump_name.replace(dump_name.find("/"), 1, "_"); + } + VLOG(6) << "dump_name: " << dump_name; + output_tensor->ToFile(dump_name); + } else { + VLOG(6) << "graph does not have " << output_name << " as output" + << std::endl; + } + } +#endif + // =========== DUMP END ================ + } + + return true; } - paddle::lite::subgraph::mlu::Graph graph_; + paddle::lite_api::PrecisionType fp_type_; + std::vector> inputs_shape_{}; + std::vector> all_inputs_shape_{}; + std::map>, + std::shared_ptr> + shape_graph_map_{}; + // enable batch size changeable by default, this cound be changed by + // environment variable PADDLE_LITE_MLU_DISABLE_BATCH_SIZE_CHANGEABLE and + // whether the op can be compiled with batch size changeable way + bool disable_batch_size_changeable_{false}; + bool error_compile_batch_size_changeable_{false}; + std::vector unsupport_batch_size_changeable_op_type_{"concat"}; + // search output runtime MLUTensor for certain output shape when enable + // BATCH_SIZE_CHANGEABLE + std::map>, + std::vector>> + shape_tensor_map_out_{}; + // search input runtime MLUTensor for certain input shape when enable + // BATCH_SIZE_CHANGEABLE + std::map>, + std::vector>> + shape_tensor_map_in_{}; + // search output shape for certain input shape when enable + // BATCH_SIZE_CHANGEABLE + std::map>, std::vector>> + in_out_shape_map_{}; }; template @@ -174,12 +481,11 @@ class SubgraphCompute param.scope, this->precision())); CHECK(engine_); - engine_->Build(); } void Run() override { CHECK(engine_); - engine_->Launch(); + engine_->Run(); } virtual ~SubgraphCompute() = default; diff --git a/lite/kernels/npu/bridges/engine.cc b/lite/kernels/npu/bridges/engine.cc index 8ca8357710e1f36a7c3f21417d7633e47f18c59a..884ab1acce8f0927def660ae35941d85b4c85901 100644 --- a/lite/kernels/npu/bridges/engine.cc +++ b/lite/kernels/npu/bridges/engine.cc @@ -15,6 +15,7 @@ #include "lite/kernels/npu/bridges/engine.h" #include #include +#include #include #include "lite/kernels/npu/bridges/registry.h" @@ -22,11 +23,50 @@ namespace paddle { namespace lite { namespace subgraph { -int Engine::BuildDeviceProgram() { return FAILED; } +Engine::Engine(KernelContext *ctx, + int block_idx, + cpp::BlockDesc *block_desc, + const std::vector &input_names, + const std::vector &output_names, + lite::Scope *scope) + : ctx_(ctx), block_idx_(block_idx), block_desc_(block_desc), scope_(scope) { + input_names_ = input_names; + output_names_ = output_names; + // Sort the name of input and output tensors, it's convenient for us to get + // the info of input and output tensors in the same order from the device + // program, because the result of subgraph division may be different but right + // at each call of the subgraph pass. 
+ std::stable_sort(input_names_.begin(), input_names_.end()); + std::stable_sort(output_names_.begin(), output_names_.end()); +} -int Engine::LaunchDeviceProgram() { return 0; } +bool Engine::Run() { + if (is_first_epoch_) { + PrepareWorkspaceForDeviceProgram(); + is_first_epoch_ = false; + } + if (InputShapeChanged()) { + BuildDeviceProgram(); + } + return LaunchDeviceProgram(); +} -int Engine::BuildOriginProgram() { +bool Engine::PrepareWorkspaceForOriginProgram() { + origin_idims_.resize(input_names_.size()); + origin_itensors_.resize(input_names_.size()); + for (int i = 0; i < input_names_.size(); i++) { + origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]); + CHECK(origin_itensors_[i]); + } + origin_otensors_.resize(output_names_.size()); + for (int i = 0; i < output_names_.size(); i++) { + origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]); + CHECK(origin_otensors_[i]); + } + return true; +} + +bool Engine::BuildOriginProgram() { // TODO(hong19860320) The block_desc needs to be divided into subgraphs during // the execution time. For now it is treated as a single subgraph. origin_program_.clear(); @@ -34,11 +74,14 @@ int Engine::BuildOriginProgram() { auto op_desc = block_desc_->GetOp(op_idx); CHECK(op_desc); std::string op_type = op_desc->Type(); + // Create op and pick up the best kernel auto op = LiteOpRegistry::Global().Create(op_desc->Type()); + CHECK(op) << "no Op found for " << op_type; op->Attach(*op_desc, scope_); std::unique_ptr picked_kernel; if (op_desc->HasAttr(kKernelTypeAttr)) { - // Create op and pick up kernel according to the kKernelTypeAttr attribute + // Create op and pick up the best kernel according to the + // kKernelTypeAttr attribute auto kernel_type = op_desc->GetAttr(kKernelTypeAttr); std::string alias; Place place; @@ -48,12 +91,14 @@ int Engine::BuildOriginProgram() { auto kernels = op->CreateKernels({place}); CHECK_GT(kernels.size(), 0u) << "No kernels found for " << op_type; auto it = std::find_if( - kernels.begin(), kernels.end(), [&](std::unique_ptr& it) { + kernels.begin(), kernels.end(), [&](std::unique_ptr &it) { return it->alias() == alias; }); CHECK(it != kernels.end()); picked_kernel = std::move(*it); } else { + // TODO(hong19860320) add kernel picking according to the type of input + // and output tensors VLOG(3) << "The attr '" << kKernelTypeAttr << "' not found, pick the first kernel for " << op_type; std::vector> kernels; @@ -74,52 +119,41 @@ int Engine::BuildOriginProgram() { } origin_program_.emplace_back(std::move(op), std::move(picked_kernel)); } - return 0; + CHECK(!origin_program_.empty()) << "no instructions"; + return true; } -int Engine::LaunchOriginProgram() { - for (auto& inst : origin_program_) { - auto op_type = inst.op()->op_info()->Type(); - if (op_type == "feed" || op_type == "fetch") continue; - inst.Run(); +bool Engine::LaunchOriginProgram() { + if (origin_program_.empty()) { + BuildOriginProgram(); + } + if (!origin_program_.empty()) { + for (auto &inst : origin_program_) { + auto op_type = inst.op()->op_info()->Type(); + if (op_type == "feed" || op_type == "fetch") continue; + inst.Run(); + } + return true; } - return 0; + return false; } -int Engine::Build() { - // In order to attach all of the ops of the block desc, we need to build the - // original program firstly. 
- BuildOriginProgram(); - // Run InferShape() of all of ops, and convert Paddle ops to NPU/XPU IR graph - build_device_program_status_ = BuildDeviceProgram(); - return build_device_program_status_; +bool Engine::PrepareWorkspaceForDeviceProgram() { + return PrepareWorkspaceForOriginProgram(); } -void Engine::InitDeviceTensor() { return; } +bool Engine::BuildDeviceProgram() { return BuildOriginProgram(); } + +bool Engine::LaunchDeviceProgram() { return LaunchOriginProgram(); } bool Engine::InputShapeChanged() { + bool changed = false; for (size_t i = 0; i < origin_itensors_.size(); i++) { - if (origin_itensors_[i]->dims() != origin_idims_[i]) { - return true; - } - } - return false; -} - -int Engine::Launch() { - // Rebuild device program when the shapes of input tensors have been changed. - if (CHECK_SUCCESS(build_device_program_status_) && - CHECK_REBUILD_WHEN_SHAPE_CHANGED(build_device_program_status_) && - InputShapeChanged()) { - Build(); - InitDeviceTensor(); - } - if (CHECK_FAILED(build_device_program_status_)) { - LaunchOriginProgram(); - } else { - LaunchDeviceProgram(); + auto origin_idim = origin_itensors_[i]->dims().Vectorize(); + changed |= origin_idim != origin_idims_[i]; + origin_idims_[i] = origin_idim; } - return 0; + return changed; } } // namespace subgraph diff --git a/lite/kernels/npu/bridges/engine.h b/lite/kernels/npu/bridges/engine.h index 6a3f72077a9bed7a296b184330af119262472ada..b49b8fea5a6d39610ea7398e177e7d1ec5a35f92 100644 --- a/lite/kernels/npu/bridges/engine.h +++ b/lite/kernels/npu/bridges/engine.h @@ -33,49 +33,36 @@ class Engine { cpp::BlockDesc *block_desc, const std::vector &input_names, const std::vector &output_names, - lite::Scope *scope, - std::string model_cache_dir = "") - : ctx_(ctx), - block_idx_(block_idx), - block_desc_(block_desc), - input_names_(input_names), - output_names_(output_names), - scope_(scope), - model_cache_dir_(model_cache_dir) {} + lite::Scope *scope); virtual ~Engine() = default; - virtual int Build(); - virtual int Launch(); + virtual bool Run(); private: Engine(const Engine &) = delete; protected: - virtual int BuildDeviceProgram(); - virtual int LaunchDeviceProgram(); + virtual bool PrepareWorkspaceForOriginProgram(); + virtual bool BuildOriginProgram(); + virtual bool LaunchOriginProgram(); - virtual int BuildOriginProgram(); - virtual int LaunchOriginProgram(); + virtual bool PrepareWorkspaceForDeviceProgram(); + virtual bool BuildDeviceProgram(); + virtual bool LaunchDeviceProgram(); - virtual void InitDeviceTensor(); virtual bool InputShapeChanged(); KernelContext *ctx_{nullptr}; - int block_idx_; - cpp::BlockDesc *block_desc_; + int block_idx_{-1}; + cpp::BlockDesc *block_desc_{nullptr}; std::vector input_names_; std::vector output_names_; Scope *scope_{nullptr}; - // SUCCESS: device program build successed. FAILED: device program build - // failed. REBUILD_WHEN_SHAPE_CHANGED: device program build successed but need - // to rebuild when input shape changed. 
- int build_device_program_status_{0}; - std::vector origin_idims_; - std::vector origin_odims_; + bool is_first_epoch_{true}; + std::vector> origin_idims_; std::vector origin_itensors_; std::vector origin_otensors_; std::vector origin_program_; - std::string model_cache_dir_{""}; }; } // namespace subgraph diff --git a/lite/kernels/npu/bridges/graph.h b/lite/kernels/npu/bridges/graph.h index 38b03e06fa212728888cf47b3048d71fd4de06fc..1bc588496a253aa82183e020adc39989ad8d7312 100644 --- a/lite/kernels/npu/bridges/graph.h +++ b/lite/kernels/npu/bridges/graph.h @@ -19,7 +19,7 @@ #include #include #include -#include "graph/op/all_ops.h" +#include "graph/compatible/all_ops.h" #include "lite/core/op_lite.h" #include "lite/core/tensor.h" diff --git a/lite/kernels/npu/bridges/matmul_op.cc b/lite/kernels/npu/bridges/matmul_op.cc index 32af1916899454ef7a045339da5e9fc8a6131cfc..79ba82d94f24f61c2b9f51bd29634151bfcfa0ab 100644 --- a/lite/kernels/npu/bridges/matmul_op.cc +++ b/lite/kernels/npu/bridges/matmul_op.cc @@ -94,10 +94,10 @@ int MatMulConverter(void* ctx, OpLite* op, KernelBase* kernel) { } else { matmul_node = graph->Add(out_name); auto matmul_op = matmul_node->data(); - matmul_op->set_input_x(*x_node->data()); - matmul_op->set_input_y(*y_node->data()); - matmul_op->set_attr_adj_x(transpose_x); - matmul_op->set_attr_adj_y(transpose_y); + matmul_op->set_input_x1(*x_node->data()); + matmul_op->set_input_x2(*y_node->data()); + matmul_op->set_attr_adj_x1(transpose_x); + matmul_op->set_attr_adj_x2(transpose_y); } if (fabs(alpha - 1.f) > 1e-6f) { diff --git a/lite/kernels/npu/bridges/utility.h b/lite/kernels/npu/bridges/utility.h index 107d90c116b8239a9060f252c45c2b2d7901ddf7..6e75e58187909ad59da37dbcb0737a92ec014e22 100644 --- a/lite/kernels/npu/bridges/utility.h +++ b/lite/kernels/npu/bridges/utility.h @@ -20,11 +20,11 @@ #include #include #include "graph/buffer.h" +#include "graph/compatible/operator_reg.h" #include "graph/graph.h" #include "graph/model.h" #include "graph/op/all_ops.h" #include "graph/operator.h" -#include "graph/operator_reg.h" #include "lite/core/op_lite.h" #include "lite/utils/macros.h" @@ -97,25 +97,26 @@ REG_OP(Pad) /* * Multiplies slices of two tensors in batches. * - * x : The input tensor - * y : The input tensor + * x1 : The input tensor + * x2 : The input tensor * - * z : The output tensor + * y : The output tensor * - * adj_x : adj_x is true, the input tensor x is transposed, otherwise - * it will not be transposed. Default is false (The current version only - * supports false). - * adj_y : adj_y is true, the input tensor y is transposed, otherwise - * it will not be transposed. Default is false. + * adj_x1 : adj_x1 is true, the input tensor x1 is transposed, + * otherwise it will not be transposed. + * Default is false (The current version only supports false). + * adj_x2 : adj_x2 is true, the input tensor x2 is transposed, + * otherwise it will not be transposed. + * Default is false. 
* - * 100.320.010.010 + * 100.320.010.010 */ REG_OP(BatchMatMul) - .INPUT(x, TensorType({DT_FLOAT})) - .INPUT(y, TensorType({DT_FLOAT})) - .OUTPUT(z, TensorType({DT_FLOAT})) - .ATTR(adj_x, AttrValue::BOOL{false}) - .ATTR(adj_y, AttrValue::BOOL{false}) + .INPUT(x1, TensorType({DT_FLOAT})) + .INPUT(x2, TensorType({DT_FLOAT})) + .OUTPUT(y, TensorType({DT_FLOAT})) + .ATTR(adj_x1, AttrValue::BOOL{false}) + .ATTR(adj_x2, AttrValue::BOOL{false}) .OP_END() } // namespace ge diff --git a/lite/kernels/npu/subgraph_compute.cc b/lite/kernels/npu/subgraph_compute.cc index f17d73f8dfd540c8a1b809d780084b05299ccc2f..6afb445e0ed411251d203bcb0420b0fba8ab6beb 100644 --- a/lite/kernels/npu/subgraph_compute.cc +++ b/lite/kernels/npu/subgraph_compute.cc @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "hiai_ir_build.h" // NOLINT #include "lite/backends/npu/device.h" @@ -24,205 +25,275 @@ #include "lite/kernels/npu/bridges/paddle_use_bridges.h" #include "lite/kernels/npu/bridges/utility.h" #include "lite/utils/io.h" +#include "lite/utils/md5.h" namespace paddle { namespace lite { namespace kernels { namespace npu { -std::string SubgraphEngine::GenerateModelCacheName() const { - auto inames = device_inames_; - auto onames = device_onames_; - std::stable_sort(inames.begin(), inames.end()); - - std::string model_cache_name = "subgraph_" + std::to_string(block_idx_); - for (auto iname : inames) { - model_cache_name += "_"; - auto itensor = scope_->FindTensor(iname); - int tmp = 0; - for (auto i : itensor->dims().Vectorize()) { - tmp += i * i; +// Generate the model name by using md5 hashes based on: +// 1. the sorted variable input names +// 2. the shapes of the origin input tensors +// 3. the sorted variable output names +std::string DeviceProgram::GenerateModelName( + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims) { + std::ostringstream os; + CHECK_EQ(input_names.size(), origin_idims.size()); + for (int i = 0; i < input_names.size(); i++) { + os << input_names[i]; + for (auto dim : origin_idims[i]) { + os << dim; } - model_cache_name += std::to_string(tmp % 1999); } - model_cache_name += "_.om"; + for (auto output_name : output_names) { + os << output_name; + } + return MD5(os.str()); +} - return model_cache_name; +// Deserialize the generated model, the precisions and dimensions of the origin +// output tensors of the subgraph op into files +bool DeviceProgram::LoadFromCacheFile( + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::string& model_cache_dir) { + // Generate the model name if not initialized + if (model_name_.empty()) { + model_name_ = GenerateModelName(input_names, output_names, origin_idims); + } + // Load from the cached model file, return a HiAI model manager client for + // inference + auto model_path = model_cache_dir + "/" + model_name_ + ".om"; + VLOG(3) << "[NPU] Load model from " << model_path; + std::vector model_buffer; + if (!ReadFile(model_path, &model_buffer)) { + LOG(WARNING) << "[NPU] read from " << model_path << " failed!"; + return false; + } + bool model_comp = false; + model_client_ = + lite::npu::Device::Global().Load(model_name_, &model_buffer, &model_comp); + if (!model_client_) { + LOG(WARNING) << "[NPU] Load model failed!"; + return false; + } + // Rewrite with the compatible model data if the cached + // model file is incompatible with the current device + if (!model_comp) { + VLOG(3) << "[NPU] Export the compatible model to " << 
model_path; + if (!WriteFile(model_path, model_buffer)) { + LOG(WARNING) << "[NPU] Open " << model_path << " for writting failed!"; + } + } + // Deserialize the precisions and shapes of the origin output tensors from the + // cached configuration file + auto config_path = model_cache_dir + "/" + model_name_ + ".cfg"; + VLOG(3) << "[NPU] Load configuration from " << config_path; + std::vector config_buffer; + if (!ReadFile(config_path, &config_buffer)) { + LOG(WARNING) << "[NPU] read from " << config_path << " failed!"; + return false; + } + std::string config_str(config_buffer.begin(), config_buffer.end()); + // Parse the precision and shapes of the output tensors + auto output_options = Split(config_str, ";"); + CHECK_EQ(output_options.size(), output_names.size()); + origin_otypes_.resize(output_names.size()); + origin_odims_.resize(output_names.size()); + for (int i = 0; i < output_names.size(); i++) { + auto items = Split(output_options[i], ":"); + CHECK_EQ(items.size(), 2); // precision and shapes + origin_otypes_[i] = static_cast(std::stoi(items[0])); + origin_odims_[i] = Split(items[1], ","); + } + return true; } -int SubgraphEngine::BuildDeviceProgram() { +bool DeviceProgram::BuildGraphAndCacheToFile( + const std::vector& origin_program, + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::vector& origin_otensors, + const std::string& model_cache_dir) { + // Generate the model name if not initialized + if (model_name_.empty()) { + model_name_ = GenerateModelName(input_names, output_names, origin_idims); + } + // Convert all of ops and their input vars and weights to HiAI IR nodes, + // then added them into the HiAI IR graph int status = 0; - // Convert all of ops and their input vars and weights and added into the NPU - // HiAI IR graph + CHECK(!origin_program.empty()) << "no instructions"; subgraph::npu::Graph graph; const auto& bridges = subgraph::Registry::Instance(); - for (auto& inst : origin_program_) { + for (auto& inst : origin_program) { auto op = const_cast(inst.op()); CHECK(op); op->CheckShape(); op->InferShape(); std::string op_type = op->op_info()->Type(); if (!bridges.Exists(op_type, TARGET(kNPU))) { - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); status |= bridges.Select(op_type, TARGET(kNPU))( reinterpret_cast(&graph), op, const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } - // Collect the valid input and output nodes in the HiAI IR graph and update - // the input and output names - device_inames_.clear(); - device_onames_.clear(); + // Collect the input and output nodes of the HiAI IR graph std::vector device_inodes; + for (size_t i = 0; i < input_names.size(); i++) { + CHECK(graph.Has(input_names[i]) && graph.Get(input_names[i])->is_data()); + device_inodes.push_back(*graph.Get(input_names[i])->data()); + } std::vector device_onodes; - for (auto& input_name : input_names_) { - if (graph.Has(input_name)) { - if (graph.Get(input_name)->is_data()) { - device_inodes.push_back(*graph.Get(input_name)->data()); - device_inames_.push_back(input_name); - } else { - LOG(WARNING) << "[NPU] Input node " << input_name - << " is ignored because it is not a data node."; - } - } else { - LOG(WARNING) << "[NPU] Input node " << input_name - << " is ignored because it does not exist."; - } + for (size_t i = 0; i < output_names.size(); i++) { + CHECK(graph.Has(output_names[i])); + device_onodes.push_back(*graph.Get(output_names[i])->data()); 
} - for (auto& output_name : output_names_) { - if (graph.Has(output_name)) { - device_onodes.push_back(*graph.Get(output_name)->data()); - device_onames_.push_back(output_name); - } else { - LOG(WARNING) << "[NPU] Output node " << output_name - << " is ignored because it does not exist."; - } + // Build the HiAI IR graph to the HiAI om model + std::vector model_buffer; + if (!lite::npu::Device::Global().Build( + device_inodes, device_onodes, &model_buffer)) { + LOG(WARNING) << "[NPU] Build model failed!"; + return false; } - CHECK(!device_inames_.empty()) - << "[NPU] No input nodes found for building NPU model"; - CHECK(!device_onames_.empty()) - << "[NPU] No output nodes found for building NPU model"; - - // Build the HiAI IR graph to HiAI om model as the device program - if (device_program_map_.count(inputs_shape_) > 0) { - return status; + // Load the HiAI om model and create a HiAI model manager client (from HiAI + // Service) to run inference. + bool model_comp = true; + model_client_ = + lite::npu::Device::Global().Load(model_name_, &model_buffer, &model_comp); + if (!model_client_) { + LOG(WARNING) << "[NPU] Load model failed!"; + return false; } - std::string model_cache_full_dir = - model_cache_dir_.empty() ? "" : model_cache_dir_ + "/" + - GenerateModelCacheName(); - auto device_client = lite::npu::Device::Global().Build( - model_name_, device_inodes, device_onodes, model_cache_full_dir); - if (device_client == nullptr) { - LOG(WARNING) << "[NPU] Build model failed!"; - return subgraph::FAILED; + // Update the precision and dimensions of the origin output tensors + CHECK_EQ(origin_otensors.size(), output_names.size()); + origin_otypes_.resize(output_names.size()); + origin_odims_.resize(output_names.size()); + for (size_t i = 0; i < output_names.size(); i++) { + origin_otypes_[i] = graph.Get(output_names[i])->precision(); + origin_odims_[i] = origin_otensors[i]->dims().Vectorize(); } - auto device_program = std::make_shared(device_client); - if (!inputs_shape_.empty()) { - device_program_map_[inputs_shape_] = device_program; + if (!model_cache_dir.empty()) { + // Save the generated model to file, used for the model caching or the + // offline model generation + auto model_path = model_cache_dir + "/" + model_name_ + ".om"; + VLOG(3) << "[NPU] Save model to " << model_path; + if (!WriteFile(model_path, model_buffer)) { + LOG(WARNING) << "[NPU] Open " << model_path << " for writing failed!"; + } + // Serialize the precisions and shapes of the origin output tensors into the + // configuration file + std::ostringstream os; + for (int i = 0; i < output_names.size(); i++) { + os << static_cast(origin_otypes_[i]) << ":"; + for (auto dim : origin_odims_[i]) { + os << dim << ","; + } + os << ";"; + } + auto str = os.str(); + std::vector config_buffer(str.begin(), str.end()); + auto config_path = model_cache_dir + "/" + model_name_ + ".cfg"; + VLOG(3) << "[NPU] Save configuration to " << config_path; + if (!WriteFile(config_path, config_buffer)) { + LOG(WARNING) << "[NPU] Open " << config_path << " for writing failed!"; + } } + return true; +} - // Query and check the dimensions of valid input and output tensors - std::vector device_idims, device_odims; - if (device_program->client->GetModelIOTensorDim( - model_name_, device_idims, device_odims) != hiai::AI_SUCCESS) { - LOG(WARNING) - << "[NPU] Get the dimensions of input and output tensors failed!"; - return subgraph::FAILED; +bool DeviceProgram::ShareBufferWithOriginTensors( + const std::vector& input_names, + const std::vector& 
output_names, + std::vector* origin_itensors, + std::vector* origin_otensors, + std::vector>* device_itensors, + std::vector>* device_otensors) { + CHECK(!model_name_.empty() && model_client_); + // Query the dimensions of the device input and output tensors if not + // initialized + if (device_idims_.empty() || device_odims_.empty()) { + if (model_client_->GetModelIOTensorDim( + model_name_, device_idims_, device_odims_) != hiai::AI_SUCCESS) { + LOG(WARNING) + << "[NPU] Get the dimensions of input and output tensors failed!"; + return false; + } } - device_program->device_idims = device_idims; - device_program->device_odims = device_odims; + // Check the dimensions of the device tensors and the origin tensors + CHECK_EQ(device_itensors->size(), input_names.size()); + CHECK_EQ(device_otensors->size(), output_names.size()); + CHECK_EQ(origin_otypes_.size(), output_names.size()); + CHECK_EQ(origin_odims_.size(), output_names.size()); + CHECK_EQ(device_idims_.size(), input_names.size()); + CHECK_EQ(device_odims_.size(), output_names.size()); + for (int i = 0; i < input_names.size(); i++) { + VLOG(3) << "[NPU] Inputs[" << i << "] name: " << input_names[i] + << " origin dims:" << (*origin_itensors)[i]->dims().repr() + << " device dims: {" << device_idims_[i].GetNumber() << "," + << device_idims_[i].GetChannel() << "," + << device_idims_[i].GetHeight() << "," + << device_idims_[i].GetWidth() << "}"; + CHECK_EQ((*origin_itensors)[i]->dims().production(), + device_idims_[i].GetNumber() * device_idims_[i].GetChannel() * + device_idims_[i].GetHeight() * device_idims_[i].GetWidth()); + VLOG(3) << "[NPU] Init the input tensors for the device program and share " + "their buffers with the origin input tensors"; + // reinit device tensor will free shared buffer, so copy data to a tmp + // tensor + Tensor tmp; + tmp.CopyDataFrom(*(*origin_itensors)[i]); + (*device_itensors)[i]->Init(&(device_idims_[i])); - CHECK_EQ(device_idims.size(), device_inames_.size()); - CHECK_EQ(device_odims.size(), device_onames_.size()); - origin_idims_.resize(device_inames_.size()); - origin_itensors_.resize(device_inames_.size()); - device_itensors_.resize(device_inames_.size()); - origin_odims_.resize(device_onames_.size()); - origin_otensors_.resize(device_onames_.size()); - device_otensors_.resize(device_onames_.size()); + std::memcpy( + (*device_itensors)[i]->GetBuffer(), tmp.raw_data(), tmp.memory_size()); - for (int i = 0; i < device_inames_.size(); i++) { - auto node = graph.Get(device_inames_[i]); - auto precision = node->precision(); - auto layout = node->layout(); - origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]); - CHECK(origin_itensors_[i]); - origin_idims_[i] = origin_itensors_[i]->dims(); - VLOG(3) << "[NPU] Inputs[" << i << "] name: " << device_inames_[i] - << " precision: " << PrecisionToStr(precision) - << " layout: " << DataLayoutToStr(layout) << " dims: {" - << device_idims[i].GetNumber() << "," - << device_idims[i].GetChannel() << "," - << device_idims[i].GetHeight() << "," << device_idims[i].GetWidth() - << "}"; - // Prepare the device input tensors - CHECK_EQ(origin_idims_[i].production(), - device_idims[i].GetNumber() * device_idims[i].GetChannel() * - device_idims[i].GetHeight() * device_idims[i].GetWidth()); - device_itensors_[i].reset(new hiai::AiTensor); - device_itensors_[i]->Init(&(device_idims[i])); + // Share data buf between device_itensor and origin_itensor + std::shared_ptr buffer = + std::make_shared((*device_itensors)[i]->GetBuffer(), + lite_api::TargetType::kHost, + 
(*device_itensors)[i]->GetSize()); + (*origin_itensors)[i]->ResetBuffer(buffer, + (*device_itensors)[i]->GetSize()); } - device_program->origin_idims = origin_idims_; - - for (int i = 0; i < device_onames_.size(); i++) { - auto node = graph.Get(device_onames_[i]); - auto precision = node->precision(); - auto layout = node->layout(); - origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); - CHECK(origin_otensors_[i]); - origin_odims_[i] = origin_otensors_[i]->dims(); - VLOG(3) << "[NPU] Outputs[" << i << "] name: " << device_onames_[i] - << " precision: " << PrecisionToStr(precision) - << " layout: " << DataLayoutToStr(layout) << " dims: {" - << device_odims[i].GetNumber() << "," - << device_odims[i].GetChannel() << "," - << device_odims[i].GetHeight() << "," << device_odims[i].GetWidth() - << "}"; - // Prepare the device output tensors - switch (precision) { - case PRECISION(kFloat): - origin_otensors_[i]->mutable_data(); - break; - case PRECISION(kBool): - origin_otensors_[i]->mutable_data(); - break; - case PRECISION(kInt8): - origin_otensors_[i]->mutable_data(); - break; - case PRECISION(kInt16): - origin_otensors_[i]->mutable_data(); - break; - case PRECISION(kInt32): - origin_otensors_[i]->mutable_data(); - break; - case PRECISION(kInt64): - origin_otensors_[i]->mutable_data(); - break; - default: - LOG(FATAL) << "[NPU] " << device_onames_[i] - << " can't mutable data with precision type " - << PrecisionToStr(precision); - break; - } - device_program->origin_odims = origin_odims_; - - CHECK_EQ(origin_odims_[i].production(), - device_odims[i].GetNumber() * device_odims[i].GetChannel() * - device_odims[i].GetHeight() * device_odims[i].GetWidth()); - device_otensors_[i].reset(new hiai::AiTensor); - device_otensors_[i]->Init(&(device_odims[i])); + for (int i = 0; i < output_names.size(); i++) { + (*origin_otensors)[i]->set_precision(origin_otypes_[i]); + (*origin_otensors)[i]->Resize(origin_odims_[i]); + VLOG(3) << "[NPU] Outputs[" << i << "] name: " << output_names[i] + << " origin dims:" << (*origin_otensors)[i]->dims().repr() + << " device dims: {" << device_odims_[i].GetNumber() << "," + << device_odims_[i].GetChannel() << "," + << device_odims_[i].GetHeight() << "," + << device_odims_[i].GetWidth() << "}"; + CHECK_EQ((*origin_otensors)[i]->dims().production(), + device_odims_[i].GetNumber() * device_odims_[i].GetChannel() * + device_odims_[i].GetHeight() * device_odims_[i].GetWidth()); + (*device_otensors)[i]->Init(&(device_odims_[i])); + VLOG(3) << "[NPU] Init the output tensors for the device program and share " + "their buffers with the origin output tensors"; + // Share data buf between device_itensor and origin_itensor + std::shared_ptr buffer = + std::make_shared((*device_otensors)[i]->GetBuffer(), + lite_api::TargetType::kHost, + (*device_otensors)[i]->GetSize()); + (*origin_otensors)[i]->ResetBuffer(buffer, + (*device_otensors)[i]->GetSize()); } - return status; + return true; } -int SubgraphEngine::LaunchDeviceProgram() { - // Copy the data of origin input tensors to the buffer of input HiAI tensors - // init device_itensors_, device_otensors_, origin_otensors_ - auto device_program = device_program_map_[inputs_shape_]; - +bool DeviceProgram::ZeroCopyRun( + std::vector>* device_itensors, + std::vector>* device_otensors) { + CHECK(!model_name_.empty() && model_client_); // Run the HiAI model by name std::string key = "model_name"; // Note: key seems must be model_name hiai::AiContext model_context; @@ -234,70 +305,87 @@ int SubgraphEngine::LaunchDeviceProgram() 
{ }; int istamp; auto start_time = GetCurrentUS(); - CHECK_EQ(device_program->client->Process( - model_context, device_itensors_, device_otensors_, 1000, istamp), + CHECK_EQ(model_client_->Process( + model_context, *device_itensors, *device_otensors, 1000, istamp), hiai::AI_SUCCESS); VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us"; - - return 0; + return true; } -int SubgraphEngine::Build() { - if (device_program_map_.count(inputs_shape_) > 0) { - return subgraph::SUCCESS; +bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() { + // Obtain the origin input tensors, and create the origin output + // tensors(Don't try to access them before launch the device program or the + // origin program) + PrepareWorkspaceForOriginProgram(); + // Create the device input and output tensors, but don't initialize them + // with the dimensions + device_itensors_.resize(input_names_.size()); + for (int i = 0; i < input_names_.size(); i++) { + device_itensors_[i].reset(new hiai::AiTensor); + CHECK(device_itensors_[i]); + } + device_otensors_.resize(output_names_.size()); + for (int i = 0; i < output_names_.size(); i++) { + device_otensors_[i].reset(new hiai::AiTensor); + CHECK(device_otensors_[i]); } - // In order to attach all of the ops of the block desc, we need to build the - // original program firstly. - BuildOriginProgram(); - // Run InferShape() of all of ops, and convert Paddle ops to NPU/XPU IR graph - build_device_program_status_ = BuildDeviceProgram(); - return build_device_program_status_; + return true; } -void SubgraphEngine::InitDeviceTensor() { - auto device_program = device_program_map_[inputs_shape_]; - for (size_t i = 0; i < device_itensors_.size(); i++) { - if (device_itensors_[i]->GetBuffer() != origin_itensors_[i]->raw_data()) { - VLOG(3) << "init device_itensors and share input tensor buf between " - "device and host"; - device_itensors_[i]->Init(&(device_program->device_idims[i])); - std::memcpy(device_itensors_[i]->GetBuffer(), - origin_itensors_[i]->raw_data(), - origin_itensors_[i]->memory_size()); - // share data buf between device_itensor and origin_itensor - std::shared_ptr buffer = - std::make_shared(device_itensors_[i]->GetBuffer(), - lite_api::TargetType::kHost, - device_itensors_[i]->GetSize()); - origin_itensors_[i]->ResetBuffer(buffer, device_itensors_[i]->GetSize()); +bool SubgraphEngine::BuildDeviceProgram() { + // Check if the cache device program exists + if (!device_programs_.count(origin_idims_)) { + auto device_program = std::make_shared(); + // Obtain the model cache dir from the NPU Context of the subgraph op + auto model_cache_dir = ctx_->As().SubgraphModelCacheDir(); + VLOG(3) << "[NPU] Getting subgraph model_cache_dir is: " << model_cache_dir; + // Check and load if the cached model and configuration file exists + if (model_cache_dir.empty() || + !device_program->LoadFromCacheFile( + input_names_, output_names_, origin_idims_, model_cache_dir)) { + // Build the model online, including converting the paddle ops to the HiAI + // IR nodes, building the HiAI IR graph to the om model, then load it as a + // new HiAI model manager client for inference. 
+ if (origin_program_.empty()) { + BuildOriginProgram(); + } + CHECK(!origin_program_.empty()) << "no instructions"; + if (!device_program->BuildGraphAndCacheToFile(origin_program_, + input_names_, + output_names_, + origin_idims_, + origin_otensors_, + model_cache_dir)) { + return false; + } } - } - for (size_t i = 0; i < device_otensors_.size(); i++) { - if (device_otensors_[i]->GetBuffer() != origin_otensors_[i]->raw_data()) { - VLOG(3) << "init device_otensors and share output tensor buf between " - "device and host"; - device_otensors_[i]->Init(&(device_program->device_odims[i])); - // share data buf between device_itensor and origin_itensor - origin_otensors_[i]->Resize(device_program->origin_odims[i]); - std::shared_ptr buffer = - std::make_shared(device_otensors_[i]->GetBuffer(), - lite_api::TargetType::kHost, - device_otensors_[i]->GetSize()); - origin_otensors_[i]->ResetBuffer(buffer, device_otensors_[i]->GetSize()); + if (device_program->model_client_ == nullptr) { + return false; } + device_programs_[origin_idims_] = device_program; } + auto device_program = device_programs_[origin_idims_]; + CHECK(device_program && device_program->model_client_); + return device_program->ShareBufferWithOriginTensors(input_names_, + output_names_, + &origin_itensors_, + &origin_otensors_, + &device_itensors_, + &device_otensors_); } -bool SubgraphEngine::InputShapeChanged() { - std::vector> new_shape; - for (auto origin_itensor : origin_itensors_) { - new_shape.push_back(origin_itensor->dims().Vectorize()); +bool SubgraphEngine::LaunchDeviceProgram() { + // Roll back to launch the origin program if the device program can't be + // found or the model client isn't initialized. + if (device_programs_.count(origin_idims_) == 0 || + device_programs_[origin_idims_]->model_client_ == nullptr) { + return LaunchOriginProgram(); } - if (inputs_shape_ == new_shape) { - return false; + auto device_program = device_programs_[origin_idims_]; + if (!device_program->model_client_) { + return LaunchOriginProgram(); } - inputs_shape_ = new_shape; - return true; + return device_program->ZeroCopyRun(&device_itensors_, &device_otensors_); } void SubgraphCompute::PrepareForRun() { @@ -307,15 +395,13 @@ void SubgraphCompute::PrepareForRun() { param.sub_block_desc, param.input_data_names, param.output_data_names, - param.scope, - NPUContext::SubgraphModelCacheDir())); + param.scope)); CHECK(engine_); - engine_->Build(); } void SubgraphCompute::Run() { CHECK(engine_); - engine_->Launch(); + engine_->Run(); } } // namespace npu diff --git a/lite/kernels/npu/subgraph_compute.h b/lite/kernels/npu/subgraph_compute.h index 9f0b5a944137dbf9a521235b80398feca1cd82b0..33321a7789fbc1eee5ff759dcf682d8e875ffe96 100644 --- a/lite/kernels/npu/subgraph_compute.h +++ b/lite/kernels/npu/subgraph_compute.h @@ -28,52 +28,65 @@ namespace lite { namespace kernels { namespace npu { -class SubgraphEngine : public subgraph::Engine { +class DeviceProgram { public: - SubgraphEngine(KernelContext *ctx, - int block_idx, - cpp::BlockDesc *block_desc, - const std::vector &input_names, - const std::vector &output_names, - Scope *scope, - std::string model_cache_dir = "") - : subgraph::Engine(ctx, - block_idx, - block_desc, - input_names, - output_names, - scope, - model_cache_dir) {} + DeviceProgram() {} + ~DeviceProgram() {} + std::string GenerateModelName( + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims); + bool LoadFromCacheFile(const std::vector& input_names, + const std::vector& 
output_names, + const std::vector>& origin_idims, + const std::string& model_cache_dir); + bool BuildGraphAndCacheToFile( + const std::vector& origin_program, + const std::vector& input_names, + const std::vector& output_names, + const std::vector>& origin_idims, + const std::vector& origin_otensors, + const std::string& model_cache_dir); + bool ShareBufferWithOriginTensors( + const std::vector& input_names, + const std::vector& output_names, + std::vector* origin_itensors, + std::vector* origin_otensors, + std::vector>* device_itensors, + std::vector>* device_otensors); + bool ZeroCopyRun( + std::vector>* device_itensors, + std::vector>* device_otensors); - struct device_program_t { - explicit device_program_t(std::shared_ptr _client) - : client(_client) {} - std::shared_ptr client{nullptr}; - std::vector origin_idims{}; - std::vector origin_odims{}; - std::vector device_idims{}; - std::vector device_odims{}; - }; + public: + std::string model_name_{""}; + std::shared_ptr model_client_{nullptr}; + std::vector> origin_odims_; + std::vector origin_otypes_; + std::vector device_idims_{}; + std::vector device_odims_{}; +}; - int Build() override; +class SubgraphEngine : public subgraph::Engine { + public: + SubgraphEngine(KernelContext* ctx, + int block_idx, + cpp::BlockDesc* block_desc, + const std::vector& input_names, + const std::vector& output_names, + Scope* scope) + : subgraph::Engine( + ctx, block_idx, block_desc, input_names, output_names, scope) {} protected: - int BuildDeviceProgram() override; - int LaunchDeviceProgram() override; - - void InitDeviceTensor() override; - bool InputShapeChanged() override; - - std::string GenerateModelCacheName() const; + bool PrepareWorkspaceForDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; - std::string model_name_{"model.om"}; - std::vector> inputs_shape_{}; - std::map>, std::shared_ptr> - device_program_map_{}; - std::vector device_inames_{}; - std::vector device_onames_{}; std::vector> device_itensors_{}; std::vector> device_otensors_{}; + std::map>, std::shared_ptr> + device_programs_; }; class SubgraphCompute : public KernelLite { diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt index 600d0d22553af9d857d03491aabd2067db8f32ef..81e1a4d7562a9decab2e2daf4001faec7ac2fcee 100644 --- a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ -21,6 +21,7 @@ add_kernel(fusion_elementwise_sub_activation_opencl add_kernel(pool_opencl OPENCL basic SRCS pool_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(activation_opencl OPENCL basic SRCS activation_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(reshape_opencl OPENCL basic SRCS reshape_image_compute.cc DEPS ${cl_kernel_deps}) +add_kernel(transpose_opencl OPENCL basic SRCS transpose_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(conv_opencl OPENCL basic SRCS conv_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(layout_opencl OPENCL basic SRCS layout_image_compute.cc DEPS ${cl_kernel_deps}) add_kernel(concat_opencl OPENCL basic SRCS concat_image_compute.cc DEPS ${cl_kernel_deps}) @@ -67,6 +68,9 @@ lite_cc_test(test_scale_image_opencl SRCS scale_image_compute_test.cc lite_cc_test(test_reshape_image_opencl SRCS reshape_image_compute_test.cc DEPS reshape_opencl op_registry program context) +lite_cc_test(test_transpose_image_opencl SRCS transpose_image_compute_test.cc + DEPS transpose_opencl layout_opencl op_registry program context) + lite_cc_test(test_concat_image_opencl SRCS 
concat_image_compute_test.cc DEPS concat_opencl layout_opencl op_registry program context) diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index fed8171cc273b437be411225363bf4a732769ae3..083f72134eba8afc7db696f68d64098b9c59a0f9 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -28,91 +28,83 @@ namespace paddle { namespace lite { namespace kernels { namespace opencl { -/* image kernel*/ + void ConvImageCompute::PrepareForRun() { - const auto& param = this->Param<param_t>(); - auto x_dims = param.x->dims(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); + ReInitWhenNeeded(); + + auto filter_dims = conv_param_->filter->dims(); + filter_tensor_n_ = filter_dims[0]; + filter_tensor_c_ = filter_dims[1]; + filter_tensor_h_ = filter_dims[2]; + filter_tensor_w_ = filter_dims[3]; - float* filter_cpu = param.filter->mutable_data<float>(); auto& context = ctx_->As<OpenCLContext>(); CHECK(context.cl_context() != nullptr); const bool is_mali = context.cl_context()->IsArmMali(); - filter_gpu_image_ = std::unique_ptr<Tensor>(new Tensor); - tensor_hold_filter_image_ = std::unique_ptr<Tensor>(new Tensor); - tensor_hold_bias_image_ = std::unique_ptr<Tensor>(new Tensor); - int bs = x_dims[0]; - int c_in = x_dims[1]; - int h_out = output_dims[2]; - int w_out = output_dims[3]; - int kernel_h = filter_dims[2]; // oihw - int kernel_w = filter_dims[3]; - auto paddings = *param.paddings; - auto dilations = *param.dilations; - int stride_h = param.strides[0]; - int stride_w = param.strides[1]; - int pad_h = paddings[0]; - int pad_w = paddings[2]; - int groups = param.groups; - bool relu_fused = param.fuse_relu; - bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); - bool zero_pad = (pad_h == 0) && (pad_w == 0); - - bool pad_equal = - ((paddings[0] == paddings[1]) && (paddings[1] == paddings[2]) && - (paddings[2] == paddings[3])); - bool stride_equal = stride_h == stride_w; - bool dilation_equal = dilations[0] == dilations[1]; + + auto paddings = *conv_param_->paddings; + pad_up_ = paddings[0]; + pad_down_ = paddings[1]; + pad_left_ = paddings[2]; + pad_right_ = paddings[3]; + + auto dilations = *conv_param_->dilations; + dilation_h_ = dilations[0]; + dilation_w_ = dilations[1]; + + stride_h_ = conv_param_->strides[0]; + stride_w_ = conv_param_->strides[1]; + + groups_ = conv_param_->groups; + relu_fused_ = conv_param_->fuse_relu; + has_bias_ = (conv_param_->bias) != nullptr; + offset_ = filter_tensor_h_ / 2 - pad_up_; + + bool pad_equal = ((pad_left_ == pad_up_) && (pad_up_ == pad_down_) && + (pad_left_ == pad_right_)); + bool stride_equal = stride_h_ == stride_w_; + bool dilation_equal = dilation_h_ == dilation_w_; VLOG(3) << "Is arm mali / " << (is_mali ? "Yes" : "No"); - VLOG(3) << "Is relu fused? / " << (relu_fused ? "Yes" : "No"); - VLOG(3) << "groups:" << groups << " stride_h:" << stride_h - << " stride_w:" << stride_w << " pad_h:" << pad_h - << " pad_w:" << pad_w << " kernel_h:" << kernel_h - << " kernel_h:" << kernel_h; - VLOG(3) << "x_dims:" << x_dims[0] << " " << x_dims[1] << " " << x_dims[2] - << " " << x_dims[3]; - VLOG(3) << "dialtion:" << dilations[0] << " " << dilations[1]; - VLOG(3) << "output_dims:" << output_dims[0] << " " << output_dims[1] << " " - << output_dims[2] << " " << output_dims[3]; - VLOG(3) << "filter_dims:" << filter_dims[0] << " " << filter_dims[1] << " " - << filter_dims[2] << " " << filter_dims[3]; + VLOG(3) << "Is relu fused? / " << (relu_fused_ ? "Yes" : "No"); + VLOG(3) << "groups:" << groups_ << " stride_h_:" << stride_h_ + << " stride_w_:" << stride_w_ << " pad_left_:" << pad_left_ + << " pad_up_:" << pad_up_ << " filter_tensor_h_:" << filter_tensor_h_ + << " filter_tensor_w_:" << filter_tensor_w_; + VLOG(3) << "input_tensor_nchw:" << input_tensor_n_ << " " << input_tensor_c_ + << " " << input_tensor_h_ << " " << input_tensor_w_; + VLOG(3) << "dilation:" << dilation_h_ << " " << dilation_w_; + VLOG(3) << "output_dims:" << output_tensor_n_ << " " << output_tensor_c_ + << " " << output_tensor_h_ << " " << output_tensor_w_; + VLOG(3) << "filter_dims:" << filter_tensor_n_ << " " << filter_tensor_c_ + << " " << filter_tensor_h_ << " " << filter_tensor_w_; VLOG(3) << "pad_equal:" << pad_equal; VLOG(3) << "stride_equal:" << stride_equal; VLOG(3) << "dilation_equal:" << dilation_equal; - VLOG(3) << "padding :" << paddings[0] << " " << paddings[1] << " " - << paddings[2] << " " << paddings[3]; + VLOG(3) << "padding :" << pad_up_ << " " << pad_down_ << " " << pad_left_ + << " " << pad_right_; CHECK(pad_equal && stride_equal && dilation_equal); + CHECK_GE(conv_param_->dilations->size(), 2); + CHECK(dilation_h_ == dilation_w_); + CHECK_GE(conv_param_->paddings->size(), 2); + CHECK(pad_left_ == pad_up_); + CHECK_GE(conv_param_->strides.size(), 2); + CHECK(stride_h_ == stride_w_); + + if (!is_mali) { + use_tune_ = false; + } - // general gws.. - auto out_image_shape = InitImageDimInfoWith(output_dims); - - const std::vector& default_work_size = - DefaultWorkSize(output_dims, - DDim(std::vector{ - static_cast(out_image_shape["width"]), - static_cast(out_image_shape["height"])})); - - default_c_blk_ = default_work_size[0]; - default_w_blk_ = default_work_size[1]; - default_nh_blk_ = default_work_size[2]; - c_blk_ = default_c_blk_; - w_blk_ = default_w_blk_; - nh_blk_ = default_nh_blk_; - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - - if (kernel_h == 1 && kernel_w == 1) { - // conv2d_1x1 - // if (param.x->dims()[1] % 4 == 0) { - // kernel_func_names_.push_back("conv2d_1x1_simple"); - // } else { - // kernel_func_names_.push_back("conv2d_1x1_opt"); - // } + /********************************************* + * Upload filter, bias to opencl device + *********************************************/ + float* filter_cpu = conv_param_->filter->mutable_data<float>(); + filter_gpu_image_ = std::unique_ptr<Tensor>(new Tensor); + tensor_hold_filter_image_ = std::unique_ptr<Tensor>(new Tensor); + tensor_hold_bias_image_ = std::unique_ptr<Tensor>(new Tensor); - if (param.x->dims()[1] % 4 == 0) { + if (filter_tensor_h_ == 1 && filter_tensor_w_ == 1) { + if (input_tensor_c_ % 4 == 0) { kernel_func_names_.push_back("conv2d_1x1_simple"); } else { kernel_func_names_.push_back("conv2d_1x1_opt"); @@ -121,89 +113,49 @@ void ConvImageCompute::PrepareForRun() { CLImageConverterNWBlock converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - // std::vector filter_image_v(filter_image_dims[0] * - // filter_image_dims[1] * 4); // 4 : - // RGBA - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); - + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data<half_t>(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data<half_t, cl::Image2D>( - filter_image_dims[0], filter_image_dims[1], 
filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d1x1opt; - { - // calc 1x1 gws - w_blk_ = maptofactor(default_w_blk_, 4); - c_blk_ = default_c_blk_; - nh_blk_ = default_nh_blk_; - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } #define DEPTH_CONV_USE_SPL #ifdef DEPTH_CONV_USE_SPL - } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] && - kernel_h == 3 && kernel_w == 3 && groups > 1) { + } else if (filter_tensor_c_ == 1 && input_tensor_c_ == output_tensor_c_ && + filter_tensor_h_ == 3 && filter_tensor_w_ == 3 && groups_ > 1) { // depth_conv2d_3x3s1, depth_conv2d_3x3 - if (stride_h == 1 && dilations[0] == 1) { + if (stride_h_ == 1 && dilation_h_ == 1) { kernel_func_names_.push_back("depth_conv2d_3x3s1"); impl_ = &ConvImageCompute::DepthwiseConv2d3x3s1; - { - // depthwise spl gws s1 - int c_block = (output_dims[1] + 3) / 4; - int w = output_dims[3]; - int nh = output_dims[0] * output_dims[2]; - int w_blk_size = 2; - int w_blk = (w + w_blk_size - 1) / w_blk_size; - - c_blk_ = c_block; - w_blk_ = w_blk; - nh_blk_ = nh; - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } } else { kernel_func_names_.push_back("depth_conv2d_3x3"); impl_ = &ConvImageCompute::DepthwiseConv2d3x3; - { - // depthwise spl gws - int c_block = (output_dims[1] + 3) / 4; - int w = output_dims[3]; - int nh = output_dims[0] * output_dims[2]; - - c_blk_ = c_block; - w_blk_ = w; - nh_blk_ = nh; - - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } } kernel_func_paths_.push_back("image/depthwise_conv2d_kernel.cl"); CLImageConverterNWBlock converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); #endif - } else if (filter_dims[1] == 1 && x_dims[1] == output_dims[1] + } else if (filter_tensor_c_ == 1 && input_tensor_c_ == output_tensor_c_ #ifdef DEPTH_CONV_USE_SPL && - kernel_h != 3 + filter_tensor_h_ != 3 #endif #undef DEPTH_CONV_USE_SPL ) { @@ -213,75 +165,61 @@ void ConvImageCompute::PrepareForRun() { CLImageConverterNWBlock converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::DepthwiseConv2d; - } else if (kernel_w == 3 && kernel_h == 3) { + } else if (filter_tensor_h_ == 3 && filter_tensor_w_ == 3) { // #define 
CONV3x3OPT_FALL_BACK #ifndef CONV3x3OPT_FALL_BACK // conv2d_3x3 - kernel_func_names_.push_back(bs > 1 ? "conv2d_3x3_multi_batch" - : "conv2d_3x3_opt"); + kernel_func_names_.push_back(input_tensor_n_ > 1 ? "conv2d_3x3_multi_batch" + : "conv2d_3x3_opt"); kernel_func_paths_.push_back("image/conv2d_3x3_opt_kernel.cl"); CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d3x3opt; - - { - int w_blk_size = 5; - int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; - - int h_blk_size = 1; - int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; - - c_blk_ = default_c_blk_; - w_blk_ = w_blk; - nh_blk_ = h_blk; - - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } #else kernel_func_names_.push_back("conv2d_3x3"); kernel_func_paths_.push_back("image/conv2d_3x3_kernel.cl"); CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d3x3; - #endif #undef CONV3x3OPT_FALL_BACK - } else if (kernel_h == 5 && kernel_w == 5) { + } else if (filter_tensor_h_ == 5 && filter_tensor_w_ == 5) { #define CONV_5x5_OPT #ifndef CONV_5x5_OPT // conv2d_5x5 @@ -290,55 +228,42 @@ void ConvImageCompute::PrepareForRun() { CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d5x5; #else // conv2d_5x5_opt - kernel_func_names_.push_back(bs > 1 ? "conv2d_5x5_multi_batch" - : "conv2d_5x5_opt"); + kernel_func_names_.push_back(input_tensor_n_ > 1 ? 
"conv2d_5x5_multi_batch" + : "conv2d_5x5_opt"); kernel_func_paths_.push_back("image/conv2d_5x5_opt_kernel.cl"); CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d5x5opt; - { - int w_blk_size = 5; - int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; - - int h_blk_size = 1; - int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; - - c_blk_ = default_c_blk_; - w_blk_ = w_blk; - nh_blk_ = h_blk; - - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } #endif #undef CONV_5x5_OPT - } else if (kernel_h == 7 && kernel_w == 7) { + } else if (filter_tensor_h_ == 7 && filter_tensor_w_ == 7) { #define CONV_7x7_OPT #ifndef CONV_7x7_OPT // conv2d_7x7 @@ -347,52 +272,39 @@ void ConvImageCompute::PrepareForRun() { CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d7x7; #else // conv2d_7x7 - kernel_func_names_.push_back(bs > 1 ? "conv2d_7x7_multi_batch" - : "conv2d_7x7_opt"); + kernel_func_names_.push_back(input_tensor_n_ > 1 ? 
"conv2d_7x7_multi_batch" + : "conv2d_7x7_opt"); kernel_func_paths_.push_back("image/conv2d_7x7_opt_kernel.cl"); CLImageConverterFolder converter; const DDim& filter_image_dims = converter.InitImageDimInfoWith(filter_dims); - tensor_hold_filter_image_->Resize( - {1, filter_image_dims[0], filter_image_dims[1], 4}); + filter_image_h_ = filter_image_dims[1]; + filter_image_w_ = filter_image_dims[0]; + tensor_hold_filter_image_->Resize({1, filter_image_w_, filter_image_h_, 4}); half_t* filter_image_data = tensor_hold_filter_image_->mutable_data(); converter.NCHWToImage(filter_cpu, filter_image_data, filter_dims); filter_gpu_image_->mutable_data( - filter_image_dims[0], filter_image_dims[1], filter_image_data); + filter_image_w_, filter_image_h_, filter_image_data); impl_ = &ConvImageCompute::Conv2d7x7opt; - { - int w_blk_size = 5; - int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; - - int h_blk_size = 1; - int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; - - c_blk_ = default_c_blk_; - w_blk_ = w_blk; - nh_blk_ = h_blk; - - global_work_size_ = cl::NDRange{static_cast(c_blk_), - static_cast(w_blk_), - static_cast(nh_blk_)}; - } #endif #undef CONV_7x7_OPT } else { @@ -404,30 +316,30 @@ void ConvImageCompute::PrepareForRun() { // build options std::string build_options_single(" -DCL_DTYPE_half"); // relu options - VLOG(3) << "relu_fused:" << relu_fused - << " param.activation_param.active_type:" - << static_cast(param.activation_param.active_type) - << " param.activation_param.has_active:" - << param.activation_param.has_active; - if (param.activation_param.has_active) { - if (param.activation_param.active_type == - lite_api::ActivationType::kRelu) { // Note: judge using `relu_fused` + VLOG(3) << "relu_fused_:" << relu_fused_ + << " conv_param_->activation_param.active_type:" + << static_cast(conv_param_->activation_param.active_type) + << " conv_param_->activation_param.has_active:" + << conv_param_->activation_param.has_active; + if (conv_param_->activation_param.has_active) { + if (conv_param_->activation_param.active_type == + lite_api::ActivationType::kRelu) { // Note: judge using `relu_fused_` // also is ok build_options_single += " -DRELU"; - } else if (param.activation_param.active_type == + } else if (conv_param_->activation_param.active_type == lite_api::ActivationType::kRelu6) { build_options_single += " -DRELU6"; } else { LOG(FATAL) << "Unsupported activation type:" - << static_cast(param.activation_param.active_type); + << static_cast(conv_param_->activation_param.active_type); } } + GetGlobalWorkSize(); // bias options - const bool has_bias = param.bias != nullptr; const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - if (has_bias) { + has_bias_ && conv_param_->output->dims() == conv_param_->bias->dims(); + if (has_bias_) { bias_gpu_image_ = std::unique_ptr(new Tensor); build_options_single += is_element_wise_bias ? 
" -DBIASE_ELE" : " -DBIASE_CH"; @@ -435,21 +347,36 @@ void ConvImageCompute::PrepareForRun() { // convert cpu buffer bias --> gpu image CLImageConverterFolder bias_converter; const DDim& bias_image_dims = - bias_converter.InitImageDimInfoWith(param.bias->dims()); - + bias_converter.InitImageDimInfoWith(conv_param_->bias->dims()); + bias_image_h_ = bias_image_dims[1]; + bias_image_w_ = bias_image_dims[0]; tensor_hold_bias_image_->Resize( {1, bias_image_dims[0], bias_image_dims[1], 4}); half_t* bias_image_data = tensor_hold_bias_image_->mutable_data(); - float* bias_cpu_data = param.bias->mutable_data(); + float* bias_cpu_data = conv_param_->bias->mutable_data(); bias_converter.NCHWToImage( - bias_cpu_data, bias_image_data, param.bias->dims()); + bias_cpu_data, bias_image_data, conv_param_->bias->dims()); this->bias_gpu_image_->mutable_data( bias_image_dims[0], bias_image_dims[1], bias_image_data); // convert cpu buffer bias --> gpu image --- end ---- + } else { + bias_gpu_image_ = std::unique_ptr(new Tensor); + CLImageConverterFolder bias_converter; + tensor_hold_bias_image_->Resize({1, 1, 1, 4}); + half_t* bias_image_data = tensor_hold_bias_image_->mutable_data(); + this->bias_gpu_image_->mutable_data( + 1, 1, bias_image_data); } + // define image pointer for filter, bias + input_image_p_ = conv_param_->x->data(); + filter_image_p_ = filter_gpu_image_->data(); + bias_image_p_ = bias_gpu_image_->data(); + output_image_p_ = conv_param_->output->mutable_data( + output_image_w_, output_image_h_); + build_options_.push_back(build_options_single); for (size_t i = 0; i < kernel_func_names_.size(); i++) { @@ -475,55 +402,55 @@ void ConvImageCompute::PrepareForRun() { VLOG(4) << "max_work_group_size: " << max_work_group_size; if (max_work_group_size > 0 && use_lws_) { - double min_turn_time = DBL_MAX; + double min_tune_time = DBL_MAX; cl::NDRange best_local_work_size = context.cl_context()->LocalWorkSize( global_work_size_, max_work_group_size); VLOG(3) << "origin :local_work_size_ : " << best_local_work_size[0] << " " << best_local_work_size[1] << " " << best_local_work_size[2]; cl::NDRange last_local_work_size = cl::NDRange{ static_cast(0), static_cast(0), static_cast(0)}; - if (use_turn_) { + if (use_tune_) { for (size_t i = 1; i < 15; i++) { - if (kernel_h == 1 && kernel_w == 1) { + if (filter_tensor_h_ == 1 && filter_tensor_w_ == 1) { // todo use diff logics - local_work_size_ = context.cl_context()->LocalWorkSizeTurn( + local_work_size_ = context.cl_context()->LocalWorkSizeTune( global_work_size_, max_work_group_size, i); } else { - local_work_size_ = context.cl_context()->LocalWorkSizeTurn( + local_work_size_ = context.cl_context()->LocalWorkSizeTune( global_work_size_, max_work_group_size, i); } if (last_local_work_size[0] == local_work_size_[0] && last_local_work_size[1] == local_work_size_[1] && last_local_work_size[2] == local_work_size_[2]) { - // skiped turned lws + // skiped tuneed lws continue; } - auto turn_time = this->Turn(10); - if (min_turn_time > turn_time) { - min_turn_time = turn_time; + auto tune_time = this->Tune(10); + if (min_tune_time > tune_time) { + min_tune_time = tune_time; best_local_work_size = local_work_size_; } last_local_work_size = local_work_size_; } // reverse for (size_t i = 1; i < 15; i++) { - if (kernel_h == 1 && kernel_w == 1) { + if (filter_tensor_h_ == 1 && filter_tensor_w_ == 1) { // todo use diff logics - local_work_size_ = context.cl_context()->LocalWorkSizeTurnReverse( + local_work_size_ = context.cl_context()->LocalWorkSizeTuneReverse( 
global_work_size_, max_work_group_size, i); } else { - local_work_size_ = context.cl_context()->LocalWorkSizeTurnReverse( + local_work_size_ = context.cl_context()->LocalWorkSizeTuneReverse( global_work_size_, max_work_group_size, i); } if (last_local_work_size[0] == local_work_size_[0] && last_local_work_size[1] == local_work_size_[1] && last_local_work_size[2] == local_work_size_[2]) { - // skiped turned lws + // skiped tuneed lws continue; } - auto turn_time = this->Turn(10); - if (min_turn_time > turn_time) { - min_turn_time = turn_time; + auto tune_time = this->Tune(10); + if (min_tune_time > tune_time) { + min_tune_time = tune_time; best_local_work_size = local_work_size_; } last_local_work_size = local_work_size_; @@ -537,548 +464,316 @@ void ConvImageCompute::PrepareForRun() { } } -void ConvImageCompute::Conv2d1x1opt(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - +void ConvImageCompute::ReInitWhenNeeded() { + conv_param_ = param_.get_mutable(); + auto x_dims = conv_param_->x->dims(); #ifdef LITE_WITH_LOG - // VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," - << global_work_size_[1] << "," << global_work_size_[2] << "}"; + LOG(INFO) << "is_first_epoch_for_run_:" << is_first_epoch_for_run_ + << ", last_input_dims_:" << last_input_dims_ + << ", x_dims:" << x_dims; #endif -#ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d_1x1 params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; -// VLOG(4) << "default work size{c_block, w, nh}: " -// << "{" << c_block << ", " << w << ", " << nh << "" -// << 
"}"; -#endif - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - // handle bias use buffer for channel wise , use image for element wise - const cl::Buffer* bias_buf = nullptr; - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } - - auto kernel = kernel_; - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, default_w_blk_); - CL_CHECK_FATAL(status); - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - if (is_turn) { - CLRuntime::Global()->command_queue().finish(); - } -} -void ConvImageCompute::Conv2d3x3(bool is_turn) { - auto kernel = kernel_; - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int input_channel = input_dims[1]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int output_channel = output_dims[1]; - int filter_width = filter_dims[3]; - int filter_height = filter_dims[2]; - int filter_channel = filter_dims[1]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - - // re-calc group - int new_groups{param.groups}; - if (filter_dims[0] == output_dims[1] && filter_dims[1] == input_dims[1]) { 
- new_groups = 1; - } else if (!(filter_dims[0] == input_dims[1] && filter_dims[1] == 1)) { - new_groups = input_channel / filter_channel; - } - /* TODO(ysh329): mobile has no case below - else { - LOG(FATAL) << "Not support conv3x3 case with" - << " input_dims:" << input_dims << " output_dims:" << - output_dims - << " filter_dims:" << filter_dims; + if (is_first_epoch_for_run_ || last_input_dims_ != x_dims) { + is_first_epoch_for_run_ = false; + last_input_dims_ = x_dims; + + input_tensor_n_ = x_dims[0]; + input_tensor_c_ = x_dims[1]; + input_tensor_h_ = x_dims[2]; + input_tensor_w_ = x_dims[3]; + auto x_image_shape = InitImageDimInfoWith(x_dims); + input_image_h_ = x_image_shape["height"]; + input_image_w_ = x_image_shape["width"]; + + auto output_dims = conv_param_->output->dims(); + output_tensor_n_ = output_dims[0]; + output_tensor_c_ = output_dims[1]; + output_tensor_h_ = output_dims[2]; + output_tensor_w_ = output_dims[3]; + auto output_image_shape = InitImageDimInfoWith(output_dims); + output_image_h_ = output_image_shape["height"]; + output_image_w_ = output_image_shape["width"]; + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + CHECK_GE(conv_param_->x->dims().size(), 4); + CHECK_GE(conv_param_->output->dims().size(), 4); + if (kernel_func_names_.size() > 0 && + kernel_func_names_[0] == "conv2d_3x3") { + groups_ = conv_param_->groups; + if (filter_tensor_n_ == output_tensor_c_ && + filter_tensor_c_ == input_tensor_c_) { + groups_ = 1; + } else if (!(filter_tensor_n_ == input_tensor_c_ && + filter_tensor_c_ == 1)) { + groups_ = input_tensor_c_ / filter_tensor_c_; + } } - */ - - // const std::vector& default_work_size = - // DefaultWorkSize(output_dims, - // DDim(std::vector{ - // static_cast(out_image_shape["width"]), - // static_cast(out_image_shape["height"])})); - - // int c_block = default_work_size[0]; - // int w = default_work_size[1]; - // int nh = default_work_size[2]; - - // VLOG(4) << "============ conv2d params ============"; - // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - // << input_image_shape["height"]; - // VLOG(4) << "input_c_block: " << input_c_block; - // VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - // VLOG(4) << "input_dims: " << input_dims; - // VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - // VLOG(4) << "output_dims: " << output_dims; - // VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - // << out_image_shape["height"]; - // VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - // VLOG(4) << "has bias: " << has_bias; - // VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - // VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - // VLOG(4) << "offset: " << offset; - // VLOG(4) << "dilations.size : " << dilations.size(); - // VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; - // VLOG(4) << "param.groups(groups):" << param.groups; - // VLOG(4) << "new_groups:" << new_groups; - // VLOG(4) << "default work size{c_block, w, nh}: " - // << "{" << c_block << ", " << w << ", " << nh << "" - // << "}"; - - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } - auto& context = 
ctx_->As(); - CHECK(context.cl_context() != nullptr); - // STL::stringstream kernel_key; - // kernel_key << kernel_func_names_[0] << build_options_[0]; - // auto kernel = context.cl_context()->GetKernel(kernel_key.str()); - // VLOG(4) << "kernel_key: " << kernel_key.str(); - // VLOG(4) << "kernel ready ... " << kernel_key.str(); - // VLOG(4) << "w: " << w; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { - VLOG(4) << "set bias_image: "; - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); + // define image pointer for input, output + input_image_p_ = conv_param_->x->data(); + output_image_p_ = conv_param_->output->mutable_data( + output_image_w_, output_image_h_); + + GetGlobalWorkSize(); } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, new_groups); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(input_dims[1])); - CL_CHECK_FATAL(status); - - // auto global_work_size = - // cl::NDRange{static_cast(default_work_size.data()[0]), - // static_cast(default_work_size.data()[1]), - // static_cast(default_work_size.data()[2])}; - - // VLOG(4) << "out_image: " << out_image; - // VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," - // << global_work_size[1] << "," << global_work_size[2] << "}"; - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); } -void ConvImageCompute::Conv2d3x3opt(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int input_channel = input_dims[1]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int output_channel = output_dims[1]; - CHECK_EQ(input_dims[0], output_dims[0]); - int 
batch = input_dims[0]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); -#ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d params ============"; - // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - // << input_image_shape["height"]; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; -#endif +void ConvImageCompute::GetGlobalWorkSize() { + if (kernel_func_names_.size() <= 0) return; + // general input_c_block + input_c_block_ = static_cast(input_image_w_ / input_tensor_w_); - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } + // general gws + auto output_dims = conv_param_->output->dims(); + const std::vector& default_work_size = + DefaultWorkSize(output_dims, + DDim(std::vector{ + static_cast(output_image_w_), + static_cast(output_image_h_)})); + default_c_blk_ = default_work_size[0]; + default_w_blk_ = default_work_size[1]; + default_nh_blk_ = default_work_size[2]; + c_blk_ = default_c_blk_; + w_blk_ = default_w_blk_; + nh_blk_ = default_nh_blk_; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { -#ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; -#endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); + if (kernel_func_names_[0] == "conv2d_1x1_simple" || + kernel_func_names_[0] == "conv2d_1x1_opt") { + w_blk_ = maptofactor(default_w_blk_, 4); + c_blk_ = default_c_blk_; + nh_blk_ = default_nh_blk_; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + + } else if (kernel_func_names_[0] == "depth_conv2d_3x3s1") { + // depthwise spl gws s1 + int c_block = (output_tensor_c_ + 3) / 4; + int w = output_tensor_w_; + int nh = output_tensor_n_ * output_tensor_h_; + int w_blk_size = 2; + int w_blk = (w + w_blk_size - 1) / w_blk_size; + + c_blk_ = c_block; + w_blk_ = w_blk; + nh_blk_ = nh; + global_work_size_ = 
cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } else if (kernel_func_names_[0] == "depth_conv2d_3x3") { + // depthwise spl gws + int c_block = (output_tensor_c_ + 3) / 4; + int w = output_tensor_w_; + int nh = output_tensor_n_ * output_tensor_h_; + + c_blk_ = c_block; + w_blk_ = w; + nh_blk_ = nh; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + input_c_block_ = static_cast((input_tensor_c_ + 3) / 4); + } else if (kernel_func_names_[0] == "conv2d_3x3_multi_batch" || + kernel_func_names_[0] == "conv2d_3x3_opt") { + int w_blk_size = 5; + int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; + + int h_blk_size = 1; + int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; + + c_blk_ = default_c_blk_; + w_blk_ = w_blk; + nh_blk_ = h_blk; + + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } else if (kernel_func_names_[0] == "conv2d_5x5_multi_batch" || + kernel_func_names_[0] == "conv2d_5x5_opt") { + int w_blk_size = 5; + int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; + + int h_blk_size = 1; + int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; + + c_blk_ = default_c_blk_; + w_blk_ = w_blk; + nh_blk_ = h_blk; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; + } else if (kernel_func_names_[0] == "conv2d_7x7_multi_batch" || + kernel_func_names_[0] == "conv2d_7x7_opt") { + int w_blk_size = 5; + int w_blk = (default_w_blk_ + w_blk_size - 1) / w_blk_size; + + int h_blk_size = 1; + int h_blk = (default_nh_blk_ + h_blk_size - 1) / h_blk_size; + + c_blk_ = default_c_blk_; + w_blk_ = w_blk; + nh_blk_ = h_blk; + global_work_size_ = cl::NDRange{static_cast(c_blk_), + static_cast(w_blk_), + static_cast(nh_blk_)}; } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, paddings[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, batch); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); +} +void ConvImageCompute::Conv2d1x1opt(bool enable_tune) { #ifdef LITE_WITH_LOG - // VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," - << global_work_size_[1] << "," << global_work_size_[2] << "}"; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = 
kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, output_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, default_w_blk_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::Conv2d5x5(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int filter_width = filter_dims[3]; - int filter_height = filter_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - +void ConvImageCompute::Conv2d3x3(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 
2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, output_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, filter_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(17, filter_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(18, filter_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(19, groups_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(20, input_tensor_c_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); +} - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { +void ConvImageCompute::Conv2d3x3opt(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; + PrintConvInfo(); #endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); + auto& context = ctx_->As(); + + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + 
CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, pad_left_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_n_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, output_tensor_h_); + CL_CHECK_FATAL(status_); #ifdef LITE_WITH_LOG // VLOG(4) << "out_image: " << out_image; @@ -1086,697 +781,406 @@ void ConvImageCompute::Conv2d5x5(bool is_turn) { << global_work_size_[1] << "," << global_work_size_[2] << "}"; #endif - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); - if (is_turn) { + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::Conv2d5x5opt(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int input_channel = input_dims[1]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int output_channel = output_dims[1]; - CHECK_EQ(input_dims[0], output_dims[0]); - int batch = input_dims[0]; - - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - -// default_work_size[2] = h_blk; +void ConvImageCompute::Conv2d5x5(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d params ============"; - // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - // << input_image_shape["height"]; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << 
dilations[0] << ", " << dilations[1]; + PrintConvInfo(); #endif - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } - - auto kernel = kernel_; - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, paddings[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, batch); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - - // VLOG(4) << "out_image: " << out_image; + auto& context = ctx_->As(); - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::Conv2d7x7(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* 
input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int filter_width = filter_dims[3]; - int filter_height = filter_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); - - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; - +void ConvImageCompute::Conv2d5x5opt(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, pad_left_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_n_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = 
kernel_.setArg(15, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { + CLRuntime::Global()->command_queue().finish(); } +} - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { +void ConvImageCompute::Conv2d7x7(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; + PrintConvInfo(); #endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); + auto& context = ctx_->As(); + + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); + if (enable_tune) { + CLRuntime::Global()->command_queue().finish(); } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); +} +void ConvImageCompute::Conv2d7x7opt(bool enable_tune) { #ifdef LITE_WITH_LOG - // VLOG(4) << "out_image: " << out_image; - VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," - << global_work_size_[1] << "," << global_work_size_[2] << "}"; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); - - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_);
+ status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, pad_left_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_n_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::Conv2d7x7opt(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int input_channel = input_dims[1]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int output_channel = output_dims[1]; - CHECK_EQ(input_dims[0], output_dims[0]); - int batch = input_dims[0]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); +void ConvImageCompute::DepthwiseConv2d3x3s1(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "============ conv2d 7x7 params ============"; - // VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - // << input_image_shape["height"]; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "input_dims: " << input_dims; - VLOG(4) << "filter_dims: " << filter_dims; - // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; - VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; + PrintConvInfo(); #endif - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == 
paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } + auto& context = ctx_->As(); - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, paddings[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, batch); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_channel); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, pad_left_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_tensor_c_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); + CL_CHECK_FATAL(status_); + + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::DepthwiseConv2d3x3s1(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto x_dims = param.x->dims(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - - auto* input_img = param.x->data(); - auto* filter_img = filter_gpu_image_->data(); - - const cl::Image2D* bias_img = nullptr; - if 
(param.bias) { - bias_img = bias_gpu_image_->data(); - } - - auto image_shape = InitImageDimInfoWith(output_dims); - - auto* output_img = param.output->mutable_data( - image_shape["width"], image_shape["height"]); - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_img); - CL_CHECK_FATAL(status); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); +void ConvImageCompute::DepthwiseConv2d3x3(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; + PrintConvInfo(); #endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *output_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(strides[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(paddings[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(dilations[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[1])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[2])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[2])); - CL_CHECK_FATAL(status); - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - event_); - CL_CHECK_FATAL(status); - - if (is_turn) { + auto& context = ctx_->As(); + + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); + + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::DepthwiseConv2d3x3(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != 
nullptr); - const auto& param = *param_.get_mutable(); - auto x_dims = param.x->dims(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto dilations = *param.dilations; - int offset = filter_dims[2] / 2 - paddings[0]; - int input_c_block = (x_dims[1] + 3) / 4; - - auto* input_img = param.x->data(); - auto* filter_img = filter_gpu_image_->data(); - - const cl::Image2D* bias_img = nullptr; - if (param.bias) { - bias_img = bias_gpu_image_->data(); - } - - auto image_shape = InitImageDimInfoWith(output_dims); - - auto* output_img = param.output->mutable_data( - image_shape["width"], image_shape["height"]); - - auto kernel = kernel_; - +void ConvImageCompute::DepthwiseConv2d(bool enable_tune) { #ifdef LITE_WITH_LOG - VLOG(4) << "setArg"; - VLOG(4) << "strides = " << strides[0]; - VLOG(4) << "offset = " << offset; - VLOG(4) << "dilations = " << dilations[0]; - VLOG(4) << "input_c_block = " << input_c_block; - VLOG(4) << "x_dims[3] = " << x_dims[3]; - VLOG(4) << "x_dims[2] = " << x_dims[2]; - VLOG(4) << "output_dims[3] = " << output_dims[3]; - VLOG(4) << "output_dims[2] = " << output_dims[2]; + PrintConvInfo(); #endif + auto& context = ctx_->As(); - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_img); - CL_CHECK_FATAL(status); - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); -#ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; -#endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *output_img); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(strides[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(offset)); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(dilations[0])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(input_c_block)); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(x_dims[2])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[3])); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(output_dims[2])); - CL_CHECK_FATAL(status); - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); - - if (is_turn) { + status_ = kernel_.setArg(0, c_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(1, w_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(2, nh_blk_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(3, *input_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(4, *filter_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(5, *bias_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(6, *output_image_p_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(7, 
stride_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(8, offset_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(9, input_c_block_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(10, dilation_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(11, input_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(12, input_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(13, output_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(14, output_tensor_h_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(15, filter_tensor_w_); + CL_CHECK_FATAL(status_); + status_ = kernel_.setArg(16, filter_tensor_h_); + CL_CHECK_FATAL(status_); + + status_ = EnqueueNDRangeKernel(context, + kernel_, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status_); + + if (enable_tune) { CLRuntime::Global()->command_queue().finish(); } } -void ConvImageCompute::DepthwiseConv2d(bool is_turn) { - auto& context = ctx_->As(); - CHECK(context.cl_context() != nullptr); - const auto& param = *param_.get_mutable(); - auto input_dims = param.x->dims(); - auto paddings = *param.paddings; - auto strides = param.strides; - auto* input_image = param.x->data(); - auto* filter_image = filter_gpu_image_->data(); - auto filter_dims = param.filter->dims(); - auto output_dims = param.output->dims(); - - int input_width = input_dims[3]; - int input_height = input_dims[2]; - int output_width = output_dims[3]; - int output_height = output_dims[2]; - int filter_width = filter_dims[3]; - int filter_height = filter_dims[2]; - auto out_image_shape = InitImageDimInfoWith(output_dims); - auto* out_image = param.output->mutable_data( - out_image_shape["width"], out_image_shape["height"]); - - const bool has_bias = param.bias != nullptr; - const bool is_element_wise_bias = - has_bias && param.output->dims() == param.bias->dims(); - int offset = static_cast(param.filter->dims()[2]) / 2 - - static_cast(paddings[0]); +void ConvImageCompute::Run() { (this->*impl_)(false); } - // calc input_c_block - auto input_image_shape = InitImageDimInfoWith(input_dims); - int input_c_block = input_image_shape["width"] / input_dims[3]; - int input_c = input_dims[1]; - auto dilations = *param.dilations; +void ConvImageCompute::PrintConvInfo() { + const bool is_element_wise_bias = + has_bias_ && conv_param_->output->dims() == conv_param_->bias->dims(); -#ifdef LITE_WITH_LOG - VLOG(4) << "============ depthwise conv2d params ============"; - VLOG(4) << "input_image_shape: " << input_image_shape["width"] << "," - << input_image_shape["height"]; - VLOG(4) << "input_c_block: " << input_c_block; - VLOG(4) << "input_c: " << input_c; - // VLOG(4) << "input_image: " << input_image; - VLOG(4) << "filter_dims: " << filter_dims; + VLOG(4) << "input_image_shape: " << input_image_w_ << "," << input_image_h_; + // VLOG(4) << "input_image: " << input_image_p_; + VLOG(4) << "input_dims: " << conv_param_->x->dims(); + VLOG(4) << "filter_dims: " << conv_param_->filter->dims(); // VLOG(4) << "filter_image: " << filter_image; - VLOG(4) << "output_dims: " << output_dims; - VLOG(4) << "out_image_shape: " << out_image_shape["width"] << ", " - << out_image_shape["height"]; - VLOG(4) << "paddings: " << paddings[0] << "," << paddings[1]; - VLOG(4) << "has bias: " << has_bias; + VLOG(4) << "output_dims: " << conv_param_->output->dims(); + VLOG(4) << "out_image_shape: " << output_image_w_ << ", " << output_image_h_; + VLOG(4) << "paddings: " << pad_left_ << "," << pad_up_; 
+ VLOG(4) << "has bias: " << has_bias_; VLOG(4) << "is_element_wise_bias : " << is_element_wise_bias; - VLOG(4) << "strides: " << strides[0] << "," << strides[1]; - VLOG(4) << "offset: " << offset; - VLOG(4) << "dilations.size : " << dilations.size(); - VLOG(4) << "dilations: " << dilations[0] << ", " << dilations[1]; -#endif - - CHECK_GE(dilations.size(), 2); - CHECK(dilations[0] == dilations[1]); - CHECK_GE(input_dims.size(), 4); - CHECK_GE(paddings.size(), 2); - CHECK(paddings[0] == paddings[1]); - CHECK_GE(strides.size(), 2); - CHECK(strides[0] == strides[1]); - - // handle bias use buffer for channel wise , use image for element wise - const cl::Buffer* bias_buf = nullptr; - const cl::Image2D* bias_image = nullptr; - if (has_bias) { - bias_image = bias_gpu_image_->data(); - } - - auto kernel = kernel_; - - cl_int status; - int arg_idx = 0; - status = kernel.setArg(arg_idx, c_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, w_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, nh_blk_); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *input_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, *filter_image); - CL_CHECK_FATAL(status); - if (has_bias) { -#ifdef LITE_WITH_LOG - VLOG(4) << "set bias_image: "; -#endif - status = kernel.setArg(++arg_idx, *bias_image); - CL_CHECK_FATAL(status); - } - status = kernel.setArg(++arg_idx, *out_image); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, strides[0]); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, offset); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_c_block); - CL_CHECK_FATAL(status); - - status = kernel.setArg(++arg_idx, dilations[0]); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, input_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, output_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, filter_height); - CL_CHECK_FATAL(status); - -#ifdef LITE_WITH_LOG + VLOG(4) << "strides: " << stride_h_ << "," << stride_w_; + VLOG(4) << "offset: "; + VLOG(4) << "dilations.size : " << conv_param_->dilations->size(); + VLOG(4) << "dilations: " << dilation_h_ << ", " << dilation_w_; VLOG(4) << "global_work_size_[3D]: {" << global_work_size_[0] << "," << global_work_size_[1] << "," << global_work_size_[2] << "}"; -#endif - - status = EnqueueNDRangeKernel(context, - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - event_); - CL_CHECK_FATAL(status); } -void ConvImageCompute::Run() { (this->*impl_)(false); } - -double ConvImageCompute::Turn(int times) { +double ConvImageCompute::Tune(int times) { auto GetCurrentUS = []() -> double { struct timeval time; gettimeofday(&time, NULL); diff --git a/lite/kernels/opencl/conv_image_compute.h b/lite/kernels/opencl/conv_image_compute.h index 64276a5721cb20718604d91d3cfac31e583ddbf1..4eab7be1f1ac6459250c6df984160f0f6060ea1c 100644 --- a/lite/kernels/opencl/conv_image_compute.h +++ b/lite/kernels/opencl/conv_image_compute.h @@ -33,6 +33,7 @@ namespace paddle { namespace lite { namespace kernels { namespace opencl { + class ConvImageCompute : public KernelLite { @@ -42,8 +43,11 @@ class ConvImageCompute : public KernelLite kernel_func_names_{}; @@ -79,19 +87,72 @@ class ConvImageCompute 
: public KernelLite tensor_hold_bias_image_{nullptr}; cl::NDRange global_work_size_ = cl::NDRange{ static_cast(1), static_cast(1), static_cast(1)}; + + // opencl kernel args int c_blk_ = 1; int w_blk_ = 1; int nh_blk_ = 1; + const cl::Image2D* input_image_p_{nullptr}; + const cl::Image2D* filter_image_p_{nullptr}; + const cl::Image2D* bias_image_p_{nullptr}; + const cl::Image2D* output_image_p_{nullptr}; + + int stride_h_{-1}; + int stride_w_{-1}; + + int dilation_h_{-1}; + int dilation_w_{-1}; + + int pad_up_{-1}; + int pad_down_{-1}; + int pad_left_{-1}; + int pad_right_{-1}; + + int offset_{-1}; + int groups_{-1}; + bool relu_fused_{false}; + bool has_bias_{false}; + + int input_tensor_n_{-1}; + int input_tensor_c_{-1}; + int input_tensor_h_{-1}; + int input_tensor_w_{-1}; + int input_image_h_{-1}; + int input_image_w_{-1}; + int input_c_block_{-1}; + + int output_tensor_n_{-1}; + int output_tensor_c_{-1}; + int output_tensor_h_{-1}; + int output_tensor_w_{-1}; + int output_image_h_{-1}; + int output_image_w_{-1}; + + int filter_tensor_n_{-1}; + int filter_tensor_c_{-1}; + int filter_tensor_h_{-1}; + int filter_tensor_w_{-1}; + int filter_image_h_{-1}; + int filter_image_w_{-1}; + + int bias_image_h_{-1}; + int bias_image_w_{-1}; + int default_c_blk_ = 1; int default_w_blk_ = 1; int default_nh_blk_ = 1; + // ================= + + DDim last_input_dims_{}; + bool is_first_epoch_for_run_{true}; cl::Kernel kernel_; + cl_int status_; cl::NDRange local_work_size_ = cl::NDRange{ static_cast(1), static_cast(1), static_cast(1)}; bool use_lws_{true}; - bool use_turn_{false}; + bool use_tune_{false}; }; } // namespace opencl diff --git a/lite/kernels/opencl/expand_image_compute_test.cc b/lite/kernels/opencl/expand_image_compute_test.cc index e3188777df9752c8ac6fd2849bdaddced975bda1..c372855193e938081208addce058e3e38b692cbb 100644 --- a/lite/kernels/opencl/expand_image_compute_test.cc +++ b/lite/kernels/opencl/expand_image_compute_test.cc @@ -11,9 +11,9 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-#include #include +#include #include "lite/backends/opencl/target_wrapper.h" #include "lite/core/op_registry.h" #include "lite/core/tensor.h" diff --git a/lite/kernels/opencl/fc_buffer_compute.cc b/lite/kernels/opencl/fc_buffer_compute.cc index 9763faf2f33f578e6f62b07a8c89390e1b80c159..3a31c8993d77388b95260ad5c0be65f791c433eb 100644 --- a/lite/kernels/opencl/fc_buffer_compute.cc +++ b/lite/kernels/opencl/fc_buffer_compute.cc @@ -35,10 +35,27 @@ class FcCompute public: using param_t = operators::FcParam; - void PrepareForRun() override {} + void PrepareForRun() override { + fc_param_ = param_.get_mutable(); + auto w_t = fc_param_->w; + auto bias_t = fc_param_->bias; + + w_gpu_t_ = std::unique_ptr(new Tensor); + auto w_gpu_data = + w_gpu_t_->mutable_data(TARGET(kOpenCL), w_t->memory_size()); + TargetWrapperCL::MemcpySync( + w_gpu_data, w_t->raw_data(), w_t->memory_size(), IoDirection::HtoD); + + bias_gpu_t_ = std::unique_ptr(new Tensor); + auto b_gpu_data = + bias_gpu_t_->mutable_data(TARGET(kOpenCL), bias_t->memory_size()); + TargetWrapperCL::MemcpySync(b_gpu_data, + bias_t->raw_data(), + bias_t->memory_size(), + IoDirection::HtoD); + } void ReInitWhenNeeded() override { - fc_param_ = param_.get_mutable(); const auto x_dims = fc_param_->input->dims(); if ((!first_epoch_for_reinit_ && x_dims != last_x_dims_) || first_epoch_for_reinit_) { @@ -93,7 +110,7 @@ class FcCompute } void GetGlobalWorkSize() { - if (m_ == 1) { // gemv + if (kernel_func_name_ == "fc_gemv_1x4") { // gemv global_work_size_ = cl::NDRange{static_cast((n_ + 3) / 4)}; } else { // gemm global_work_size_ = cl::NDRange{static_cast((m_ + 3) / 4), @@ -103,8 +120,8 @@ class FcCompute void Run() override { auto* x_buf = fc_param_->input->data(); - auto* w_buf = fc_param_->w->data(); - auto* bias_buf = fc_param_->bias->data(); + auto* w_buf = w_gpu_t_->data(); + auto* bias_buf = bias_gpu_t_->data(); auto* out_buf = fc_param_->output->mutable_data(TARGET(kOpenCL)); @@ -154,6 +171,10 @@ class FcCompute std::string time_stamp_{GetTimeStamp()}; bool first_epoch_for_reinit_{true}; DDim last_x_dims_; + + std::unique_ptr w_gpu_t_{nullptr}; + std::unique_ptr bias_gpu_t_{nullptr}; + cl::NDRange global_work_size_; cl::Kernel kernel_; }; @@ -166,7 +187,7 @@ class FcCompute REGISTER_LITE_KERNEL( fc, kOpenCL, kFloat, kNCHW, paddle::lite::kernels::opencl::FcCompute, def) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kOpenCL))}) - .BindInput("W", {LiteType::GetTensorTy(TARGET(kOpenCL))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .Finalize(); diff --git a/lite/kernels/opencl/fc_buffer_compute_test.cc b/lite/kernels/opencl/fc_buffer_compute_test.cc index 4c9c8c47e4306c92486dd1b847884200959453dd..85793dffee9e4717e257ad8c73258ce35ad61d54 100644 --- a/lite/kernels/opencl/fc_buffer_compute_test.cc +++ b/lite/kernels/opencl/fc_buffer_compute_test.cc @@ -126,9 +126,11 @@ TEST(fc, compute) { out.Resize(out_dim); out_ref.Resize(out_dim); + VLOG(2) << "out.dims():" << out.dims() << ", out_dim:" << out_dim; + auto* x_data = x.mutable_data(TARGET(kOpenCL)); - auto* w_data = w.mutable_data(TARGET(kOpenCL)); - auto* bias_data = bias.mutable_data(TARGET(kOpenCL)); + auto* w_data = w.mutable_data(); + auto* bias_data = bias.mutable_data(); auto* out_data = out.mutable_data(TARGET(kOpenCL)); std::default_random_engine engine; @@ -148,17 +150,15 @@ TEST(fc, compute) 
{ } for (size_t i = 0; i < w_dim.production(); ++i) { w_source[i] = static_cast(dist(engine)); + w_data[i] = w_source[i]; } for (size_t i = 0; i < bias_dim.production(); ++i) { bias_source[i] = 10; // static_cast(dist(engine)); + bias_data[i] = 10; } TargetWrapperCL::MemcpySync( x_data, x_source.data(), x_size, IoDirection::HtoD); - TargetWrapperCL::MemcpySync( - w_data, w_source.data(), w_size, IoDirection::HtoD); - TargetWrapperCL::MemcpySync( - bias_data, bias_source.data(), bias_size, IoDirection::HtoD); // run opencl kernel kernel->Launch(); @@ -186,8 +186,10 @@ TEST(fc, compute) { #endif std::vector out_data_from_gpu(out_dim.production()); - TargetWrapperCL::MemcpySync( - out_data_from_gpu.data(), out_data, bias_size, IoDirection::DtoH); + TargetWrapperCL::MemcpySync(out_data_from_gpu.data(), + out_data, + out_data_from_gpu.size() * sizeof(float), + IoDirection::DtoH); // run cpu ref auto* out_ref_data = out_ref.mutable_data(TARGET(kARM)); diff --git a/lite/kernels/opencl/transpose_image_compute.cc b/lite/kernels/opencl/transpose_image_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..31184092efa40cea47c3cacb6a65f03d15a229b2 --- /dev/null +++ b/lite/kernels/opencl/transpose_image_compute.cc @@ -0,0 +1,395 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
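+// OpenCL image2d implementation of the transpose and transpose2 operators.
+// transpose / transpose_4d run on the GPU via image/transpose_kernel.cl, while
+// transpose2 maps the input image back to host memory, permutes it with CPU
+// loops (a shuffle-channel fast path or a general permute), and writes the
+// result back to the output image.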
+ +#include "lite/backends/opencl/cl_half.h" +#include "lite/backends/opencl/cl_include.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/opencl/image_helper.h" +#include "lite/operators/op_params.h" +#include "lite/utils/logging.h" +#include "lite/utils/replace_stl/stream.h" +#ifdef LITE_WITH_PROFILE +#include "lite/core/profile/profiler.h" +#endif +#include "lite/backends/opencl/cl_utility.h" + +#undef LITE_WITH_LOG + +namespace paddle { +namespace lite { +namespace kernels { +namespace opencl { + +// transpose operator +class TransposeComputeFloatImage + : public KernelLite { + public: + using param_t = operators::TransposeParam; + + void PrepareForRun() override { + auto& param = *param_.get_mutable(); + Tensor* const output = param.output; + const DDimLite& out_dims = output->dims(); + if (out_dims.size() == 4) { + kernel_func_name_ = "transpose_4d"; + } else { + kernel_func_name_ = "transpose"; + } + auto& context = ctx_->As(); + VLOG(1) << "kernel_func_name_:" << kernel_func_name_; + context.cl_context()->AddKernel(kernel_func_name_, + "image/transpose_kernel.cl", + build_options_, + time_stamp_); + } + +#ifdef LITE_WITH_PROFILE + void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) { + ch->kernel_func_name = kernel_func_name_; + ch->cl_event = + event_; // `event_` defined in `kernel.h`, valid after kernel::Run + } +#endif + + void Run() override { + auto& param = *param_.get_mutable(); + const Tensor* const x = param.x; + const auto x_dims = x->dims(); + const std::map& input_image_shape = + InitImageDimInfoWith(x_dims); + const int64_t& input_image_width = input_image_shape.at("width"); + const int64_t& input_image_height = input_image_shape.at("height"); + const cl::Image2D* const x_image = x->data(); + + Tensor* const output = param.output; + const DDimLite& out_dims = output->dims(); + VLOG(4) << "out_dims= " << out_dims; + const std::map& out_image_shape = + InitImageDimInfoWith(out_dims); + cl::Image2D* const out_image = output->mutable_data( + out_image_shape.at("width"), out_image_shape.at("height")); +#ifdef LITE_WITH_LOG + VLOG(4) << "out_dims= " << out_dims; +#endif + const std::vector& default_work_size = DefaultWorkSize( + out_dims, + DDim(std::vector{ + static_cast(out_image_shape.at("width")), + static_cast(out_image_shape.at("height"))})); + + int out_C = 0, out_H = 0, out_W = 0, in_W = 0; + if (param.output->dims().size() == 4) { + out_C = out_dims[1]; + out_H = out_dims[2]; + out_W = out_dims[3]; + in_W = x_dims[3]; + } else if (param.output->dims().size() == 3) { + out_C = out_dims[0]; + out_H = out_dims[1]; + out_W = out_dims[2]; + in_W = x_dims[2]; + } else if (param.output->dims().size() == 2) { + out_C = 1; + out_H = out_dims[0]; + out_W = out_dims[1]; + in_W = x_dims[1]; + } + +#ifdef LITE_WITH_LOG + VLOG(4) << "out_C=" << out_C; + VLOG(4) << "out_H=" << out_H; + VLOG(4) << "out_W=" << out_W; + VLOG(4) << "in_W=" << in_W; + VLOG(4) << "default_work_size= " << default_work_size[0] << ", " + << default_work_size[1] << ", " << default_work_size[2]; +#endif + + auto& context = ctx_->As(); + CHECK(context.cl_context() != nullptr); + STL::stringstream kernel_key; + kernel_key << kernel_func_name_ << build_options_ << time_stamp_; + auto kernel = context.cl_context()->GetKernel(kernel_key.str()); + +#ifdef LITE_WITH_LOG + VLOG(4) << TargetToStr(x->target()); + VLOG(4) << TargetToStr(param.output->target()); +#endif + + int arg_idx = 0; + cl_int status; + status = kernel.setArg(arg_idx, *x_image); + 
CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, *out_image); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, out_C); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, out_H); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, out_W); + CL_CHECK_FATAL(status); + status = kernel.setArg(++arg_idx, in_W); + CL_CHECK_FATAL(status); + + auto global_work_size = + cl::NDRange{static_cast(default_work_size.data()[0]), + static_cast(default_work_size.data()[1]), + static_cast(default_work_size.data()[2])}; + + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_); + CL_CHECK_FATAL(status); + } + + private: + std::string kernel_func_name_{"transpose"}; + std::string build_options_{"-DCL_DTYPE_half"}; + std::string time_stamp_{GetTimeStamp()}; +}; + +// transpose2 operator +class Transpose2ComputeFloatImage + : public KernelLite { + public: + using param_t = operators::TransposeParam; + + void PrepareForRun() override {} + +#ifdef LITE_WITH_PROFILE + void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {} +#endif + + bool IsShuffleChannel(const std::vector& axis) { + bool is_shuffle_channel = true; + if (axis.size() > 2 && axis[0] == 0 && axis[1] == 2 && axis[2] == 1) { + for (int i = 3; i < axis.size(); ++i) { + if (axis[i] != i) { + is_shuffle_channel = false; + break; + } + } + } else { + return false; + } + return is_shuffle_channel; + } + + template + void DeviceTensorToHostTensor(const Tensor* device_tensor, + Tensor* host_tensor) { + host_tensor->Resize(device_tensor->dims()); + Dtype* host_ptr = host_tensor->mutable_data(); + CLRuntime::Global()->command_queue().finish(); + CLImageConverterDefault default_converter; + auto device_tensor_image_dim = + default_converter.InitImageDimInfoWith(device_tensor->dims()); + half_t* image_data = new half_t[device_tensor_image_dim.production() * 4]; + TargetWrapperCL::ImgcpySync(image_data, + device_tensor->data(), + device_tensor_image_dim[0], + device_tensor_image_dim[1], + 0, + 0, + IoDirection::DtoH); + default_converter.ImageToNCHW( + image_data, host_ptr, device_tensor_image_dim, host_tensor->dims()); + delete[] image_data; + } + + template + void HostTensorToDeviceTensor(const Tensor* host_tensor, + Tensor* device_tensor) { + Dtype* host_ptr = const_cast(host_tensor->data()); + CLImageConverterDefault default_converter; + auto device_tensor_image_dim = + default_converter.InitImageDimInfoWith(device_tensor->dims()); + device_tensor->mutable_data( + device_tensor_image_dim[0], device_tensor_image_dim[1]); + half_t* image_data = new half_t[device_tensor->dims().production() * 4]; + default_converter.NCHWToImage(host_ptr, image_data, device_tensor->dims()); + + TargetWrapperCL::ImgcpySync( + device_tensor->mutable_data(), + image_data, + device_tensor_image_dim[0], + device_tensor_image_dim[1], + 0, + 0, + IoDirection::HtoD); + + delete[] image_data; + } + + template + void ShuffleChannelCompute(const operators::TransposeParam& param) { + const Tensor* input = param.x; + Tensor* input_tensor = new Tensor(); + DeviceTensorToHostTensor(input, input_tensor); + Dtype* input_ptr = input_tensor->mutable_data(); + + Tensor* output = param.output; + Tensor* output_tensor = new Tensor(); + output_tensor->Resize(output->dims()); + Dtype* output_ptr = output_tensor->mutable_data(); + + // input and output's shape dimension must >= 2 && <= 6. 
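+ // Shuffle-channel fast path (axis starts with {0, 2, 1}): swap dims 1 and 2
+ // by copying one contiguous block of `offset` elements (the product of the
+ // trailing dims from index 3 onward) per (batch, c1, c2) triple.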
+ const DDim& in_dim = input->dims(); + const DDim& out_dim = output->dims(); + size_t offset = 1; + for (int i = 3; i < param.axis.size(); ++i) { + offset *= in_dim[i]; + } +#pragma omp parallel for collapse(3) + for (int batch = 0; batch < out_dim[0]; ++batch) { + for (int c1 = 0; c1 < out_dim[1]; ++c1) { + for (int c2 = 0; c2 < out_dim[2]; ++c2) { + size_t out_offset = + ((batch * out_dim[1] + c1) * out_dim[2] + c2) * offset; + size_t in_offset = + ((batch * in_dim[1] + c2) * in_dim[2] + c1) * offset; + memcpy(output_ptr + out_offset, + input_ptr + in_offset, + offset * sizeof(Dtype)); + } + } + } + HostTensorToDeviceTensor(output_tensor, output); + delete input_tensor; + delete output_tensor; + } + + template + void Transpose2Compute(const operators::TransposeParam& param) { + const Tensor* input = param.x; + Tensor* input_tensor = new Tensor(); + DeviceTensorToHostTensor(input, input_tensor); + Dtype* input_ptr = input_tensor->mutable_data(); + + Tensor* output = param.output; + Tensor* output_tensor = new Tensor(); + output_tensor->Resize(output->dims()); + Dtype* output_ptr = output_tensor->mutable_data(); + + // input and output's shape dimension must >= 2 && <= 6. + const DDim& in_dim = input->dims(); + const DDim& out_dim = output->dims(); + + // precompute inverted output dim and strides + size_t rout_dim[6], strides[6]; + auto& axis = param.axis; + int permute = axis.size(); // permute must >=2 && <= 6. + for (int i = 0; i < permute; ++i) { + int k = permute - 1 - i; + strides[k] = 1; + for (int j = axis[i] + 1; j < permute; ++j) { + strides[k] *= in_dim[j]; + } + rout_dim[k] = out_dim[i]; + } + + // unroll the first 2 dimensions + int reamin_dim = 1; + for (int i = 2; i < out_dim.size(); ++i) { + reamin_dim *= out_dim[i]; + } + +#pragma omp parallel for collapse(2) + for (int batch = 0; batch < out_dim[0]; ++batch) { + for (int j = 0; j < out_dim[1]; ++j) { + size_t offset = batch * strides[permute - 1] + j * strides[permute - 2]; + Dtype* out_ptr = output_ptr + (batch * out_dim[1] + j) * reamin_dim; + int indics[4] = {0, 0, 0, 0}; + for (int k = 0; k < reamin_dim; ++k) { + out_ptr[k] = input_ptr[offset]; + indics[0] += 1; + offset += strides[0]; + for (int p = 0; p < permute - 3; ++p) { + if (indics[p] == rout_dim[p]) { + indics[p + 1] += 1; + indics[p] = 0; + offset += strides[p + 1]; + offset -= rout_dim[p] * strides[p]; + } else { + break; + } + } + } + } + } + HostTensorToDeviceTensor(output_tensor, output); + delete input_tensor; + delete output_tensor; + } + + void Run() override { + auto& param = *param_.get_mutable(); + const std::vector axis = param.axis; + + bool shuffle_channel = IsShuffleChannel(axis); + if (shuffle_channel) { + ShuffleChannelCompute(param); + } else { + Transpose2Compute(param); + } + } +}; + +} // namespace opencl +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(transpose, + kOpenCL, + kFP16, + kImageDefault, + paddle::lite::kernels::opencl::TransposeComputeFloatImage, + image2d) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .Finalize(); + +REGISTER_LITE_KERNEL(transpose2, + kOpenCL, + kFP16, + kImageDefault, + paddle::lite::kernels::opencl::Transpose2ComputeFloatImage, + image2d) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("Out", + 
{LiteType::GetTensorTy(TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault))}) + .BindOutput("XShape", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); + +#define LITE_WITH_LOG diff --git a/lite/kernels/opencl/transpose_image_compute_test.cc b/lite/kernels/opencl/transpose_image_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9db9b3732d44aa3f342a8cf8b7b2fe5819586a5f --- /dev/null +++ b/lite/kernels/opencl/transpose_image_compute_test.cc @@ -0,0 +1,172 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/backends/opencl/target_wrapper.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +#include "lite/kernels/opencl/test_helper.h" +#include "lite/operators/reshape_op.h" +#include "lite/utils/logging.h" + +#define FP16_MAX_DIFF (5e-1) + +namespace paddle { +namespace lite { +namespace kernels { +namespace opencl { + +static inline void TestWithKernel( + const std::unique_ptr& kernel) { + int64_t batch_size = 1; + int64_t ic = 2; + int64_t ih = 3; + int64_t iw = 4; + + int64_t oc = 3; + int64_t oh = 4; + int64_t ow = 2; + + lite::Tensor input, output; + operators::TransposeParam param; + + param.x = &input; + param.output = &output; + param.axis = std::vector({0, 2, 3, 1}); + const DDim input_dim = + lite::DDim{std::vector({batch_size, ic, ih, iw})}; + input.Resize(input_dim); + const DDim output_dim = + lite::DDim{std::vector({batch_size, oc, oh, ow})}; + param.output->Resize(output_dim); + + LOG(INFO) << "prepare kernel SetParam------"; + kernel->SetParam(param); + + size_t input_image_width = iw * ((ic + 3) / 4); + size_t input_image_height = ih * batch_size; + + size_t output_image_width = ow * ((oc + 3) / 4); + size_t output_image_height = oh * batch_size; + + const size_t cl_image2d_row_pitch{0}; + const size_t cl_image2d_slice_pitch{0}; + + std::vector input_v(batch_size * ic * ih * iw); + + LOG(INFO) << "gen input ..."; + + float* input_v_data = &input_v[0]; + auto index = 0; + for (auto& i : input_v) { + i = index++; + } + + paddle::lite::CLImageConverterDefault default_convertor; + + std::vector x_image_data(input_image_width * input_image_height * + 4); // 4 : RGBA + + LOG(INFO) << "set mapped input ..."; + default_convertor.NCHWToImage(input_v_data, x_image_data.data(), input_dim); + + auto* input_image = input.mutable_data( + input_image_width, input_image_height, x_image_data.data()); + + LOG(INFO) << "prepare kernel ready"; + + LOG(INFO) << "mutable output ..."; + CLImageConverterDefault default_converter; + DDim out_image_shape = default_converter.InitImageDimInfoWith(output_dim); + LOG(INFO) << "out_image_shape = " << out_image_shape[0] << " " + << out_image_shape[1]; + auto* out_image = output.mutable_data( + out_image_shape[0], out_image_shape[1]); + + LOG(INFO) << "kernel context ..."; + std::unique_ptr context(new KernelContext); + context->As().InitOnce(); + + std::unique_ptr 
transpose_context(new KernelContext); + context->As().CopySharedTo( + &(transpose_context->As())); + kernel->SetContext(std::move(transpose_context)); + + LOG(INFO) << "kernel launch ..."; + kernel->Launch(); + + CLRuntime::Global()->command_queue().finish(); + + half_t* out_image_data = new half_t[out_image_shape.production() * 4]; + TargetWrapperCL::ImgcpySync(out_image_data, + output.data(), + out_image_shape[0], + out_image_shape[1], + cl_image2d_row_pitch, + cl_image2d_slice_pitch, + IoDirection::DtoH); + float* out_data = new float[out_image_shape.production() * 4]; + default_converter.ImageToNCHW( + out_image_data, out_data, out_image_shape, output_dim); + + // check output data + index = 0; + auto hxw = ih * iw; + auto cxhxw = ic * hxw; + for (auto n = 0; n < batch_size; n++) { + for (auto h = 0; h < ih; h++) { + for (auto w = 0; w < iw; w++) { + for (auto c = 0; c < ic; c++) { + auto input_index = n * cxhxw + c * hxw + h * iw + w; + auto input_value = input_v_data[input_index]; + auto output_value = out_data[index]; + auto abs_diff = abs(input_value - output_value); + auto relative_diff = COMPUTE_RELATIVE_DIFF(input_value, output_value); + EXPECT_EQ( + (relative_diff <= FP16_MAX_DIFF) || (abs_diff <= FP16_MAX_DIFF), + true); + index++; + } + } + } + } +} + +TEST(transpose_opencl, compute) { + auto kernels = KernelRegistry::Global().Create("transpose", + TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + auto kernel = std::move(kernels.front()); + TestWithKernel(kernel); +} + +TEST(transpose2_opencl, compute) { + auto kernels = KernelRegistry::Global().Create("transpose2", + TARGET(kOpenCL), + PRECISION(kFP16), + DATALAYOUT(kImageDefault)); + ASSERT_FALSE(kernels.empty()); + auto kernel = std::move(kernels.front()); + TestWithKernel(kernel); +} + +} // namespace opencl +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(transpose, kOpenCL, kFP16, kImageDefault, image2d); diff --git a/lite/kernels/rknpu/subgraph_compute.cc b/lite/kernels/rknpu/subgraph_compute.cc index e0b63205705609b6899918ce8e254ccdf6cbad47..a50505c38c0740f762256cd71e006caf9249838e 100644 --- a/lite/kernels/rknpu/subgraph_compute.cc +++ b/lite/kernels/rknpu/subgraph_compute.cc @@ -28,13 +28,36 @@ namespace lite { namespace kernels { namespace rknpu { -int SubgraphEngine::BuildDeviceProgram() { +bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() { + // Obtain the origin input tensors, and create the origin output + // tensors(Don't try to access them before launch the device program or the + // origin program) + PrepareWorkspaceForOriginProgram(); + // Create the device input and output tensors, but don't initialize them + // with the dimensions + device_itensors_.resize(input_names_.size()); + for (int i = 0; i < input_names_.size(); i++) { + device_itensors_[i].reset(new hiai::AiTensor); + CHECK(device_itensors_[i]); + } + device_otensors_.resize(output_names_.size()); + for (int i = 0; i < output_names_.size(); i++) { + device_otensors_[i].reset(new hiai::AiTensor); + CHECK(device_otensors_[i]); + } + return true; +} + +bool SubgraphEngine::BuildDeviceProgram() { LOG(INFO) << "[RKNPU]:BuildDeviceProgram"; int status = 0; // Convert all of ops and their input vars and weights and added into the NPU // RKNPU IR graph subgraph::rknpu::Graph graph; const auto& bridges = subgraph::Registry::Instance(); + if (origin_program_.empty()) { + BuildOriginProgram(); + } for (auto& inst : origin_program_) { auto op = 
const_cast(inst.op()); CHECK(op); @@ -42,13 +65,13 @@ int SubgraphEngine::BuildDeviceProgram() { op->InferShape(); std::string op_type = op->op_info()->Type(); if (!bridges.Exists(op_type, TARGET(kRKNPU))) { - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); status |= bridges.Select(op_type, TARGET(kRKNPU))( reinterpret_cast(&graph), op, const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } // Collect the valid input and output nodes in the RKNPU IR graph and update @@ -91,7 +114,7 @@ int SubgraphEngine::BuildDeviceProgram() { model_name_, graph.GetHandle(), device_itensors_, device_otensors_); if (device_program_ == nullptr) { LOG(WARNING) << "[RKNPU] Build model failed!"; - return subgraph::FAILED; + return false; } // input @@ -165,10 +188,10 @@ int SubgraphEngine::BuildDeviceProgram() { break; } } - return status; + return true; } -int SubgraphEngine::LaunchDeviceProgram() { +bool SubgraphEngine::LaunchDeviceProgram() { LOG(INFO) << "[RKNPU]:LaunchDeviceProgram"; std::vector inputs; std::vector outputs; @@ -195,7 +218,7 @@ int SubgraphEngine::LaunchDeviceProgram() { device_program_->SetInputs(inputs); device_program_->Run(); device_program_->GetOutputs(outputs); - return 0; + return true; } void SubgraphCompute::PrepareForRun() { @@ -208,13 +231,12 @@ void SubgraphCompute::PrepareForRun() { param.output_data_names, param.scope)); CHECK(engine_); - engine_->Build(); } void SubgraphCompute::Run() { LOG(INFO) << "[RKNPU]:Run"; CHECK(engine_); - engine_->Launch(); + engine_->Run(); } } // namespace rknpu diff --git a/lite/kernels/rknpu/subgraph_compute.h b/lite/kernels/rknpu/subgraph_compute.h index 863e6aef39ad54f0e9d94d4b507c6fca4128ebb8..a4bdadc658a81decd8107072f7b5948613d0c68a 100644 --- a/lite/kernels/rknpu/subgraph_compute.h +++ b/lite/kernels/rknpu/subgraph_compute.h @@ -42,14 +42,15 @@ class SubgraphEngine : public subgraph::Engine { ctx, block_idx, block_desc, input_names, output_names, scope) {} protected: - int BuildDeviceProgram() override; - int LaunchDeviceProgram() override; + bool PrepareWorkspaceForDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; std::string model_name_; std::vector device_inames_; std::vector device_onames_; - std::vector> device_itensors_; - std::vector> device_otensors_; + std::vector> device_itensors_{}; + std::vector> device_otensors_{}; std::unique_ptr device_program_{nullptr}; }; diff --git a/lite/kernels/x86/activation_compute.cc b/lite/kernels/x86/activation_compute.cc index 2910364f37b74d94977e2397e31eb97fd367825e..9b4c2fadd9ce427db272a9bb0cfd0e0a10716f11 100644 --- a/lite/kernels/x86/activation_compute.cc +++ b/lite/kernels/x86/activation_compute.cc @@ -78,3 +78,13 @@ REGISTER_LITE_KERNEL(softsign, .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); + +REGISTER_LITE_KERNEL(sigmoid, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SoftsignCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/activation_compute_test.cc b/lite/kernels/x86/activation_compute_test.cc index 8cc2607e73e605214e08e42e70de457a206e2468..550cf299f676105271e758eb1a13e880045ee1cc 100644 --- a/lite/kernels/x86/activation_compute_test.cc +++ b/lite/kernels/x86/activation_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific 
language governing permissions and // limitations under the License. -#include "lite/kernels/x86/activation_compute.cc" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/activation_compute.cc" namespace paddle { namespace lite { @@ -26,8 +28,7 @@ namespace kernels { namespace x86 { TEST(relu_x86, retrive_op) { - auto relu = - KernelRegistry::Global().Create("relu"); + auto relu = KernelRegistry::Global().Create("relu"); ASSERT_FALSE(relu.empty()); ASSERT_TRUE(relu.front()); } diff --git a/lite/kernels/x86/attention_padding_mask_compute_test.cc b/lite/kernels/x86/attention_padding_mask_compute_test.cc index 35ce822e010fc3ce2dc756b86e3a437789cc8359..5c672a1ee05116ccefec074f54d0726a7cd010ea 100644 --- a/lite/kernels/x86/attention_padding_mask_compute_test.cc +++ b/lite/kernels/x86/attention_padding_mask_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/attention_padding_mask_compute.cc" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/attention_padding_mask_compute.cc" namespace paddle { namespace lite { @@ -81,8 +83,7 @@ int get_max_len(const LoD& lod) { TEST(attention_padding_mask_x86, retrive_op) { auto attention_padding_mask = - KernelRegistry::Global().Create( - "attention_padding_mask"); + KernelRegistry::Global().Create("attention_padding_mask"); ASSERT_FALSE(attention_padding_mask.empty()); ASSERT_TRUE(attention_padding_mask.front()); } diff --git a/lite/kernels/x86/batch_norm_compute_test.cc b/lite/kernels/x86/batch_norm_compute_test.cc index 5ec2cdcdda0e9ff3698c80584b36396b38328e03..dd70f78efa7334355c459fd1d85a7da4f5b05b60 100644 --- a/lite/kernels/x86/batch_norm_compute_test.cc +++ b/lite/kernels/x86/batch_norm_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/batch_norm_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/batch_norm_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace x86 { TEST(batch_norm_x86, retrive_op) { - auto batch_norm = - KernelRegistry::Global().Create( - "batch_norm"); + auto batch_norm = KernelRegistry::Global().Create("batch_norm"); ASSERT_FALSE(batch_norm.empty()); ASSERT_TRUE(batch_norm.front()); } diff --git a/lite/kernels/x86/cast_compute_test.cc b/lite/kernels/x86/cast_compute_test.cc index f7aa52ca6d0dde603357f009220b4a3a53f56833..b039cf5d3b01032e60ef7bdcf31a45c8ed302215 100644 --- a/lite/kernels/x86/cast_compute_test.cc +++ b/lite/kernels/x86/cast_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
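Note on the activation_compute.cc hunk above: the new block registers the `sigmoid` op but binds it to `paddle::lite::kernels::x86::SoftsignCompute`, i.e. it appears to reuse the softsign kernel class from the registration immediately before it. If that is unintended and a dedicated sigmoid kernel class exists in activation_compute.h (an assumption here, not something visible in this patch), the registration would presumably look like the sketch below; otherwise `sigmoid` resolves to the softsign computation on x86.

REGISTER_LITE_KERNEL(sigmoid,
                     kX86,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::x86::SigmoidCompute,  // hypothetical class name
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
    .Finalize();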
-#include "lite/kernels/x86/cast_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/cast_compute.h" namespace paddle { namespace lite { @@ -25,8 +27,7 @@ namespace kernels { namespace x86 { TEST(cast_x86, retrive_op) { - auto cast = - KernelRegistry::Global().Create("cast"); + auto cast = KernelRegistry::Global().Create("cast"); ASSERT_FALSE(cast.empty()); ASSERT_TRUE(cast.front()); } diff --git a/lite/kernels/x86/concat_compute_test.cc b/lite/kernels/x86/concat_compute_test.cc index 468e9422752561ff6416e8859b485462b9e2abbe..4be51dff6ed613842de431cce8a7960182073c4f 100644 --- a/lite/kernels/x86/concat_compute_test.cc +++ b/lite/kernels/x86/concat_compute_test.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/concat_compute.h" #include + #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/concat_compute.h" namespace paddle { namespace lite { @@ -23,9 +25,7 @@ namespace kernels { namespace x86 { TEST(concat_x86, retrive_op) { - auto concat = - KernelRegistry::Global().Create( - "concat"); + auto concat = KernelRegistry::Global().Create("concat"); ASSERT_FALSE(concat.empty()); ASSERT_TRUE(concat.front()); } diff --git a/lite/kernels/x86/conv_compute_test.cc b/lite/kernels/x86/conv_compute_test.cc index 2827c6577e5bf311b4002526d4ac10f636162d96..cd46571a2a9fd6b428f84ca278a453c8675d6ed6 100644 --- a/lite/kernels/x86/conv_compute_test.cc +++ b/lite/kernels/x86/conv_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/conv_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/conv_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(conv_x86, retrive_op) { - auto conv2d = - KernelRegistry::Global().Create( - "conv2d"); + auto conv2d = KernelRegistry::Global().Create("conv2d"); ASSERT_FALSE(conv2d.empty()); ASSERT_TRUE(conv2d.front()); } diff --git a/lite/kernels/x86/dropout_compute_test.cc b/lite/kernels/x86/dropout_compute_test.cc index 279f639f40ece0a10e45fe16f36fcb443cea550a..d30fbbea670d9509e722e3a27fd3dbf1d89a308c 100644 --- a/lite/kernels/x86/dropout_compute_test.cc +++ b/lite/kernels/x86/dropout_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/dropout_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/dropout_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace x86 { TEST(dropout_x86, retrive_op) { - auto dropout = - KernelRegistry::Global().Create( - "dropout"); + auto dropout = KernelRegistry::Global().Create("dropout"); ASSERT_FALSE(dropout.empty()); ASSERT_TRUE(dropout.front()); } diff --git a/lite/kernels/x86/elementwise_compute_test.cc b/lite/kernels/x86/elementwise_compute_test.cc index 9850c0ce86756cd12e28ab95688b79a1c539189c..6379faacad75f98f73eafbdfc2f8c9deb4d086cb 100644 --- a/lite/kernels/x86/elementwise_compute_test.cc +++ b/lite/kernels/x86/elementwise_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/elementwise_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/elementwise_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace x86 { TEST(elementwise_add_x86, retrive_op) { - auto elementwise_add = - KernelRegistry::Global().Create( - "elementwise_add"); + auto elementwise_add = KernelRegistry::Global().Create("elementwise_add"); ASSERT_FALSE(elementwise_add.empty()); ASSERT_TRUE(elementwise_add.front()); } diff --git a/lite/kernels/x86/elementwise_op_function.h b/lite/kernels/x86/elementwise_op_function.h index f736248ed3632af92dea2823439e6e7d28ff3e1b..4cb7160097e320798c1b1e2ee94d7fec8aedc6d6 100644 --- a/lite/kernels/x86/elementwise_op_function.h +++ b/lite/kernels/x86/elementwise_op_function.h @@ -22,7 +22,6 @@ limitations under the License. */ #include "lite/fluid/for_range.h" #include "lite/fluid/transform.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/paddle_enforce.h" #include "lite/utils/variant.h" namespace paddle { @@ -66,9 +65,8 @@ inline void get_mid_dims(const lite::DDim &x_dims, for (size_t i = 0; i < y_dims.size(); ++i) { if (x_dims[i + axis] != y_dims[i]) { // only support single y_dims[i] = 1 now. - PADDLE_ENFORCE_EQ( - *mid_flag, 0, "Broadcast support y_dims with single 1."); - PADDLE_ENFORCE_EQ(y_dims[i], 1, "Broadcast dimension mismatch."); + CHECK_EQ(*mid_flag, 0) << "Broadcast support y_dims with single 1."; + CHECK_EQ(y_dims[i], 1) << "Broadcast dimension mismatch."; // m*n*k m*1*k for (size_t j = 0; j < i; ++j) { (*pre) *= y_dims[j]; @@ -95,8 +93,7 @@ inline void get_mid_dims(const lite::DDim &x_dims, } for (size_t i = 0; i < y_dims.size(); ++i) { - PADDLE_ENFORCE_EQ( - x_dims[i + axis], y_dims[i], "Broadcast dimension mismatch."); + CHECK_EQ(x_dims[i + axis], y_dims[i]) << "Broadcast dimension mismatch."; (*n) *= y_dims[i]; } @@ -314,17 +311,16 @@ void ElementwiseComputeEx(const lite::Context &ctx, TransformFunctor functor(x, y, z, ctx, func); auto x_dims = x->dims(); auto y_dims_untrimed = y->dims(); - PADDLE_ENFORCE_GE(x_dims.size(), - y_dims_untrimed.size(), - "Rank of first input must >= rank of second input."); + CHECK_GE(x_dims.size(), y_dims_untrimed.size()) + << "Rank of first input must >= rank of second input."; if (x_dims == y_dims_untrimed) { functor.Run(); return; } axis = (axis == -1 ? x_dims.size() - y_dims_untrimed.size() : axis); - PADDLE_ENFORCE(axis >= 0 && axis < static_cast(x_dims.size()), - "Axis should be in range [0, x_dims)"); + CHECK(axis >= 0 && axis < static_cast(x_dims.size())) + << "Axis should be in range [0, x_dims)"; auto y_dims = trim_trailing_singular_dims(y_dims_untrimed); axis = (y_dims.size() == 0) ? 
x_dims.size() : axis; int pre, n, post, mid_flag = 0; @@ -560,9 +556,8 @@ void FusedElemwiseAndActComputeEx(const lite::Context &ctx, lite::Tensor *out, lite::Tensor *intermediate_out) { if (KeepIntermediateOut) { - PADDLE_ENFORCE(intermediate_out, - "The save_intermediate_out is opened, " - "intermediate_out should not be nullptr."); + CHECK(intermediate_out) << "The save_intermediate_out is opened, " + "intermediate_out should not be nullptr."; } const lite::DDim &x_dim = x.dims(); diff --git a/lite/kernels/x86/fill_constant_batch_size_like_compute_test.cc b/lite/kernels/x86/fill_constant_batch_size_like_compute_test.cc index 16bec18a1c1c4d0075e1ed1dcc4f3a3462917868..e3e8b13413808b447018ac14acf9d4a16c0f47a6 100644 --- a/lite/kernels/x86/fill_constant_batch_size_like_compute_test.cc +++ b/lite/kernels/x86/fill_constant_batch_size_like_compute_test.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/fill_constant_batch_size_like_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/fill_constant_batch_size_like_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -26,8 +29,7 @@ namespace x86 { TEST(fill_constant_batch_size_like_x86, retrive_op) { auto fill_constant_batch_size_like = - KernelRegistry::Global().Create( - "fill_constant_batch_size_like"); + KernelRegistry::Global().Create("fill_constant_batch_size_like"); ASSERT_FALSE(fill_constant_batch_size_like.empty()); ASSERT_TRUE(fill_constant_batch_size_like.front()); } diff --git a/lite/kernels/x86/gather_compute_test.cc b/lite/kernels/x86/gather_compute_test.cc index 286dfcb08a0c2c7bc038e0ad3b5673bd7c0f8b19..63284452244b19b807f8b101cab5cbabbbf68476 100644 --- a/lite/kernels/x86/gather_compute_test.cc +++ b/lite/kernels/x86/gather_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/gather_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/gather_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(gather_x86, retrive_op) { - auto gather = - KernelRegistry::Global().Create( - "gather"); + auto gather = KernelRegistry::Global().Create("gather"); ASSERT_FALSE(gather.empty()); int cnt = 0; for (auto item = gather.begin(); item != gather.end(); ++item) { diff --git a/lite/kernels/x86/gelu_compute_test.cc b/lite/kernels/x86/gelu_compute_test.cc index e930cd32df91196fa9f4559ee6ba22bd8b82d337..9bda9ac4c1c0cee84141095b3100bb82a99661b7 100644 --- a/lite/kernels/x86/gelu_compute_test.cc +++ b/lite/kernels/x86/gelu_compute_test.cc @@ -13,10 +13,12 @@ // limitations under the License. 
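For context on the elementwise_op_function.h hunks above (where the PADDLE_ENFORCE_* calls are replaced by the equivalent CHECK_* macros): get_mid_dims splits x's shape around the region that y is broadcast over, so the elementwise kernels can loop over a [pre, n, post] view of x. Only the changed lines are visible in this diff, so the sketch below simply recomputes those three factors for made-up dimensions rather than calling the function itself:

#include <cassert>
#include <cstdint>

// Illustration: x has shape [2, 3, 4, 5], y has shape [3, 4], axis = 1.
// y then lines up with dims 1..2 of x, and the kernel can treat x as a
// [pre, n, post] = [2, 12, 5] block, broadcasting y across pre and post.
int main() {
  const int64_t x_dims[] = {2, 3, 4, 5};
  const int64_t y_dims[] = {3, 4};
  const int axis = 1;
  const int x_rank = 4, y_rank = 2;

  int64_t pre = 1, n = 1, post = 1;
  for (int i = 0; i < axis; ++i) pre *= x_dims[i];
  for (int i = 0; i < y_rank; ++i) {
    assert(x_dims[i + axis] == y_dims[i]);  // CHECK_EQ(...) << "..." in the real code
    n *= y_dims[i];
  }
  for (int i = axis + y_rank; i < x_rank; ++i) post *= x_dims[i];

  assert(pre == 2 && n == 12 && post == 5);
  return 0;
}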
#include + #include #include #include #include + #include "lite/core/op_registry.h" #include "lite/kernels/x86/activation_compute.cc" @@ -26,8 +28,7 @@ namespace kernels { namespace x86 { TEST(gelu_x86, retrive_op) { - auto gelu = - KernelRegistry::Global().Create("gelu"); + auto gelu = KernelRegistry::Global().Create("gelu"); ASSERT_FALSE(gelu.empty()); ASSERT_TRUE(gelu.front()); } diff --git a/lite/kernels/x86/gru_compute_test.cc b/lite/kernels/x86/gru_compute_test.cc index 3e0e944f23bafda6a5eb742a8e4b023c268c9955..c4a0045b3c1b27dfb1b518aede7dad2872cd1dc2 100644 --- a/lite/kernels/x86/gru_compute_test.cc +++ b/lite/kernels/x86/gru_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/gru_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/gru_compute.h" namespace paddle { namespace lite { @@ -26,8 +28,7 @@ namespace kernels { namespace x86 { TEST(gru_x86, retrive_op) { - auto gru = - KernelRegistry::Global().Create("gru"); + auto gru = KernelRegistry::Global().Create("gru"); ASSERT_FALSE(gru.empty()); ASSERT_TRUE(gru.front()); } diff --git a/lite/kernels/x86/layer_norm_compute.h b/lite/kernels/x86/layer_norm_compute.h index 46d151bbc406e19b498b87420029da7f9c1c2f12..ba75dad11b75441dc09b75224bfc4dfb271396a8 100644 --- a/lite/kernels/x86/layer_norm_compute.h +++ b/lite/kernels/x86/layer_norm_compute.h @@ -63,10 +63,10 @@ class LayerNormCompute : public KernelLite { out.ShareDataWith(*y); out.Resize(matrix_shape); - PADDLE_ENFORCE_EQ(Mean->numel(), left); - PADDLE_ENFORCE_EQ(Var->numel(), left); - PADDLE_ENFORCE_EQ(Scale->numel(), right); - PADDLE_ENFORCE_EQ(Bias->numel(), right); + CHECK_EQ(Mean->numel(), left); + CHECK_EQ(Var->numel(), left); + CHECK_EQ(Scale->numel(), right); + CHECK_EQ(Bias->numel(), right); auto ker = paddle::lite::jit::KernelFuncs, lite::fluid::CPUPlace>::Cache() diff --git a/lite/kernels/x86/layer_norm_compute_test.cc b/lite/kernels/x86/layer_norm_compute_test.cc index d39500a5e8827230ddeecd6bbe30f8c0a47ee929..617f1fae066aa6dc5068d293f8e977a2d37fe496 100644 --- a/lite/kernels/x86/layer_norm_compute_test.cc +++ b/lite/kernels/x86/layer_norm_compute_test.cc @@ -12,15 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/layer_norm_compute.h" #include + #include #include #include + #include "lite/backends/x86/jit/helper.h" #include "lite/backends/x86/jit/kernel_base.h" #include "lite/backends/x86/jit/kernels.h" #include "lite/core/op_registry.h" +#include "lite/kernels/x86/layer_norm_compute.h" namespace paddle { namespace lite { @@ -74,9 +76,7 @@ std::vector ref(lite::Tensor* x, // layer_norm TEST(layer_norm_x86, retrive_op) { - auto layer_norm = - KernelRegistry::Global().Create( - "layer_norm"); + auto layer_norm = KernelRegistry::Global().Create("layer_norm"); ASSERT_FALSE(layer_norm.empty()); ASSERT_TRUE(layer_norm.front()); } diff --git a/lite/kernels/x86/leaky_relu_compute_test.cc b/lite/kernels/x86/leaky_relu_compute_test.cc index 76daf4ff9ffc5dea8b532610abc917406356b3a5..75ebcf071298d072682b6ea535b3c8244c328500 100644 --- a/lite/kernels/x86/leaky_relu_compute_test.cc +++ b/lite/kernels/x86/leaky_relu_compute_test.cc @@ -13,8 +13,10 @@ // limitations under the License. 
#include + #include #include + #include "lite/core/op_registry.h" #include "lite/kernels/x86/activation_compute.h" @@ -24,9 +26,7 @@ namespace kernels { namespace x86 { TEST(leaky_relu_x86, retrive_op) { - auto leaky_relu = - KernelRegistry::Global().Create( - "leaky_relu"); + auto leaky_relu = KernelRegistry::Global().Create("leaky_relu"); ASSERT_FALSE(leaky_relu.empty()); ASSERT_TRUE(leaky_relu.front()); } diff --git a/lite/kernels/x86/match_matrix_tensor_compute_test.cc b/lite/kernels/x86/match_matrix_tensor_compute_test.cc index 0c3f3ad50940ab0059ab04fb507a786f735584b9..02ed8e1b4bb3a7bccc8560cb1f51166d3833e6bf 100644 --- a/lite/kernels/x86/match_matrix_tensor_compute_test.cc +++ b/lite/kernels/x86/match_matrix_tensor_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/match_matrix_tensor_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/match_matrix_tensor_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(match_matrix_tensor_x86, retrive_op) { - auto kernel = - KernelRegistry::Global().Create( - "match_matrix_tensor"); + auto kernel = KernelRegistry::Global().Create("match_matrix_tensor"); ASSERT_FALSE(kernel.empty()); ASSERT_TRUE(kernel.front()); } diff --git a/lite/kernels/x86/matmul_compute_test.cc b/lite/kernels/x86/matmul_compute_test.cc index 53d2d1a47a0cdbdaf5dfa83a79987d908171a36d..1e98702193af11ea8678bdfbc2382c7845c49b38 100644 --- a/lite/kernels/x86/matmul_compute_test.cc +++ b/lite/kernels/x86/matmul_compute_test.cc @@ -12,22 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/matmul_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/matmul_compute.h" + namespace paddle { namespace lite { namespace kernels { namespace x86 { TEST(matmul_x86, retrive_op) { - auto matmul = - KernelRegistry::Global().Create( - "matmul"); + auto matmul = KernelRegistry::Global().Create("matmul"); ASSERT_FALSE(matmul.empty()); ASSERT_TRUE(matmul.front()); } diff --git a/lite/kernels/x86/mul_compute_test.cc b/lite/kernels/x86/mul_compute_test.cc index 32d82cbb77aeb71dcd1c172ec0c1e343c3954fea..0d66a2dbd6eb27dac6acde47cc395c3c6245b1b5 100644 --- a/lite/kernels/x86/mul_compute_test.cc +++ b/lite/kernels/x86/mul_compute_test.cc @@ -12,21 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/mul_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/mul_compute.h" + namespace paddle { namespace lite { namespace kernels { namespace x86 { TEST(mul_x86, retrive_op) { - auto mul = - KernelRegistry::Global().Create("mul"); + auto mul = KernelRegistry::Global().Create("mul"); ASSERT_FALSE(mul.empty()); ASSERT_TRUE(mul.front()); } diff --git a/lite/kernels/x86/pool_compute_test.cc b/lite/kernels/x86/pool_compute_test.cc index 4ea727cedd5206f5f1ac2685297f72c3019bb313..d67d3a1de2248a1f8c180867c76b5d31affc11b9 100644 --- a/lite/kernels/x86/pool_compute_test.cc +++ b/lite/kernels/x86/pool_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/pool_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/pool_compute.h" namespace paddle { namespace lite { @@ -26,9 +28,7 @@ namespace kernels { namespace x86 { TEST(pool_x86, retrive_op) { - auto pool2d = - KernelRegistry::Global().Create( - "pool2d"); + auto pool2d = KernelRegistry::Global().Create("pool2d"); ASSERT_FALSE(pool2d.empty()); ASSERT_TRUE(pool2d.front()); } diff --git a/lite/kernels/x86/relu_compute_test.cc b/lite/kernels/x86/relu_compute_test.cc index 37ed6db7f919e31828f89462fa46d5263c480fcc..c2233bd04cf33c983db521335d88339592d2ce6b 100644 --- a/lite/kernels/x86/relu_compute_test.cc +++ b/lite/kernels/x86/relu_compute_test.cc @@ -13,8 +13,10 @@ // limitations under the License. #include + #include #include + #include "lite/core/op_registry.h" #include "lite/kernels/x86/activation_compute.h" @@ -24,8 +26,7 @@ namespace kernels { namespace x86 { TEST(relu_x86, retrive_op) { - auto relu = - KernelRegistry::Global().Create("relu"); + auto relu = KernelRegistry::Global().Create("relu"); ASSERT_FALSE(relu.empty()); ASSERT_TRUE(relu.front()); } diff --git a/lite/kernels/x86/reshape_compute_test.cc b/lite/kernels/x86/reshape_compute_test.cc index 16fc8f31aded0ef62fdf14aa671a73ccf6635fb7..88f38adee4aa413ac91bfdec0294c816020942b5 100644 --- a/lite/kernels/x86/reshape_compute_test.cc +++ b/lite/kernels/x86/reshape_compute_test.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/reshape_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/reshape_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -26,9 +29,7 @@ namespace x86 { // reshape TEST(reshape_x86, retrive_op) { - auto reshape = - KernelRegistry::Global().Create( - "reshape"); + auto reshape = KernelRegistry::Global().Create("reshape"); ASSERT_FALSE(reshape.empty()); ASSERT_TRUE(reshape.front()); } @@ -86,9 +87,7 @@ TEST(reshape_x86, run_test) { // reshape2 TEST(reshape2_x86, retrive_op) { - auto reshape2 = - KernelRegistry::Global().Create( - "reshape2"); + auto reshape2 = KernelRegistry::Global().Create("reshape2"); ASSERT_FALSE(reshape2.empty()); ASSERT_TRUE(reshape2.front()); } diff --git a/lite/kernels/x86/scale_compute_test.cc b/lite/kernels/x86/scale_compute_test.cc index 6da27f444c7ed4c5a86e5f08a6c1612110bb02b9..dafb1e590f27f14208cff1e9aef79b28256cd048 100644 --- a/lite/kernels/x86/scale_compute_test.cc +++ b/lite/kernels/x86/scale_compute_test.cc @@ -12,11 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/scale_compute.h" #include + #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/scale_compute.h" namespace paddle { namespace lite { @@ -24,8 +26,7 @@ namespace kernels { namespace x86 { TEST(scale_x86, retrive_op) { - auto scale = - KernelRegistry::Global().Create("scale"); + auto scale = KernelRegistry::Global().Create("scale"); ASSERT_FALSE(scale.empty()); ASSERT_TRUE(scale.front()); } diff --git a/lite/kernels/x86/search_fc_compute_test.cc b/lite/kernels/x86/search_fc_compute_test.cc index 425df2a0f0544d7345923cb2efdce96074845311..515a5e30c81e9edd6b9ebb8e52955b5de6ec9e24 100644 --- a/lite/kernels/x86/search_fc_compute_test.cc +++ b/lite/kernels/x86/search_fc_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/search_fc_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/search_fc_compute.h" namespace paddle { namespace lite { @@ -53,9 +55,7 @@ void fc_cpu_base(const lite::Tensor* X, } TEST(search_fc_x86, retrive_op) { - auto search_fc = - KernelRegistry::Global().Create( - "search_fc"); + auto search_fc = KernelRegistry::Global().Create("search_fc"); ASSERT_FALSE(search_fc.empty()); ASSERT_TRUE(search_fc.front()); } diff --git a/lite/kernels/x86/search_grnn_compute_test.cc b/lite/kernels/x86/search_grnn_compute_test.cc index b85d97e3f1be1f2f02837d347e42ce6731c58414..d120ca7500513bc99b71bf0003ec31bcf1e2ac19 100644 --- a/lite/kernels/x86/search_grnn_compute_test.cc +++ b/lite/kernels/x86/search_grnn_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/search_grnn_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/search_grnn_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(search_grnn_x86, retrive_op) { - auto kernel = - KernelRegistry::Global().Create( - "search_grnn"); + auto kernel = KernelRegistry::Global().Create("search_grnn"); ASSERT_FALSE(kernel.empty()); ASSERT_TRUE(kernel.front()); } diff --git a/lite/kernels/x86/search_group_padding_compute_test.cc b/lite/kernels/x86/search_group_padding_compute_test.cc index f4c36c2a63488a6bb902a2b8b4ad81fa32b37672..ae2007e463c0fc97a099cd5ae902b623e361066c 100644 --- a/lite/kernels/x86/search_group_padding_compute_test.cc +++ b/lite/kernels/x86/search_group_padding_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/search_group_padding_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/search_group_padding_compute.h" namespace paddle { namespace lite { @@ -26,8 +28,7 @@ namespace x86 { TEST(search_group_padding_x86, retrieve_op) { auto search_group_padding = - KernelRegistry::Global().Create( - "search_group_padding"); + KernelRegistry::Global().Create("search_group_padding"); ASSERT_FALSE(search_group_padding.empty()); ASSERT_TRUE(search_group_padding.front()); } diff --git a/lite/kernels/x86/search_seq_depadding_compute_test.cc b/lite/kernels/x86/search_seq_depadding_compute_test.cc index 0d978b35ed040d6b7c44354f37999e6e34e2e3ef..32bf3276bb378beafbf273ffe7142b9b8fc493ac 100644 --- a/lite/kernels/x86/search_seq_depadding_compute_test.cc +++ b/lite/kernels/x86/search_seq_depadding_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/search_seq_depadding_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/search_seq_depadding_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(search_seq_depadding_x86, retrive_op) { - auto kernel = - KernelRegistry::Global().Create( - "search_seq_depadding"); + auto kernel = KernelRegistry::Global().Create("search_seq_depadding"); ASSERT_FALSE(kernel.empty()); ASSERT_TRUE(kernel.front()); } diff --git a/lite/kernels/x86/sequence_arithmetic_compute_test.cc b/lite/kernels/x86/sequence_arithmetic_compute_test.cc index 3b41e7d7ce37ebaf6a3f8518bc248ff4ec5c1aec..d80d3c2d1097fe2bbb47eb4c9d1384ae54d7fe8c 100644 --- a/lite/kernels/x86/sequence_arithmetic_compute_test.cc +++ b/lite/kernels/x86/sequence_arithmetic_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/sequence_arithmetic_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/sequence_arithmetic_compute.h" namespace paddle { namespace lite { @@ -77,8 +79,7 @@ void prepare_input(Tensor* x, const LoD& x_lod) { TEST(sequence_arithmetic_x86, retrive_op) { auto sequence_arithmetic = - KernelRegistry::Global().Create( - "sequence_arithmetic"); + KernelRegistry::Global().Create("sequence_arithmetic"); ASSERT_FALSE(sequence_arithmetic.empty()); ASSERT_TRUE(sequence_arithmetic.front()); } diff --git a/lite/kernels/x86/sequence_concat_compute_test.cc b/lite/kernels/x86/sequence_concat_compute_test.cc index eb6678a655ed1eb5a7bcda1dc2a6b8afe4477d2d..9899e6c08a1d1af9dea3728b5105ff78286de819 100644 --- a/lite/kernels/x86/sequence_concat_compute_test.cc +++ b/lite/kernels/x86/sequence_concat_compute_test.cc @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/sequence_concat_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/sequence_concat_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -94,9 +97,7 @@ static void sequence_concat_ref(const std::vector& xs, } // namespace TEST(sequence_concat_x86, retrive_op) { - auto sequence_concat = - KernelRegistry::Global().Create( - "sequence_concat"); + auto sequence_concat = KernelRegistry::Global().Create("sequence_concat"); ASSERT_FALSE(sequence_concat.empty()); ASSERT_TRUE(sequence_concat.front()); } diff --git a/lite/kernels/x86/sequence_expand_as_compute_test.cc b/lite/kernels/x86/sequence_expand_as_compute_test.cc index d49fdbb7a6164435abb9eb7189b18376066d55df..6eafb5f1e5275e375b7c61fda3c437b6959b8dd2 100644 --- a/lite/kernels/x86/sequence_expand_as_compute_test.cc +++ b/lite/kernels/x86/sequence_expand_as_compute_test.cc @@ -12,13 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/sequence_expand_as_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/sequence_expand_as_compute.h" namespace paddle { namespace lite { @@ -27,8 +29,7 @@ namespace x86 { TEST(sequence_expand_as_x86, retrive_op) { auto sequence_expand_as = - KernelRegistry::Global().Create( - "sequence_expand_as"); + KernelRegistry::Global().Create("sequence_expand_as"); ASSERT_FALSE(sequence_expand_as.empty()); ASSERT_TRUE(sequence_expand_as.front()); } diff --git a/lite/kernels/x86/sequence_pool_compute_test.cc b/lite/kernels/x86/sequence_pool_compute_test.cc index 372bfaf8741cdcdc902efb6b8380eb4c34dd49ad..35116adbf6f06b87482cfff99182ee6c675ba7ed 100644 --- a/lite/kernels/x86/sequence_pool_compute_test.cc +++ b/lite/kernels/x86/sequence_pool_compute_test.cc @@ -12,21 +12,22 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/sequence_pool_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/sequence_pool_compute.h" + namespace paddle { namespace lite { namespace kernels { namespace x86 { TEST(sequence_pool_x86, retrive_op) { - auto sequence_pool = - KernelRegistry::Global().Create( - "sequence_pool"); + auto sequence_pool = KernelRegistry::Global().Create("sequence_pool"); ASSERT_FALSE(sequence_pool.empty()); ASSERT_TRUE(sequence_pool.front()); } diff --git a/lite/kernels/x86/sequence_reverse_compute_test.cc b/lite/kernels/x86/sequence_reverse_compute_test.cc index adf9981b242bfbb7f60989369715354cc2043685..37c2f9571d486a36eccc1f01c06a1550d4609730 100644 --- a/lite/kernels/x86/sequence_reverse_compute_test.cc +++ b/lite/kernels/x86/sequence_reverse_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/sequence_reverse_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/sequence_reverse_compute.h" namespace paddle { namespace lite { @@ -44,9 +46,7 @@ static void sequence_reverse_ref(const lite::Tensor* x, lite::Tensor* y) { } // namespace TEST(sequence_reverse_x86, retrive_op) { - auto sequence_reverse = - KernelRegistry::Global().Create( - "sequence_reverse"); + auto sequence_reverse = KernelRegistry::Global().Create("sequence_reverse"); ASSERT_FALSE(sequence_reverse.empty()); ASSERT_TRUE(sequence_reverse.front()); } diff --git a/lite/kernels/x86/sgd_compute.cc b/lite/kernels/x86/sgd_compute.cc index a3241468f9f09d66401aa83e0d738779e555dfba..dd056e30209953c1f360d714db50e3236f278510 100644 --- a/lite/kernels/x86/sgd_compute.cc +++ b/lite/kernels/x86/sgd_compute.cc @@ -41,8 +41,8 @@ class SGDCompute : public KernelLite { auto *param_out = &sgd_param.ParamOut->raw_tensor(); auto sz = param_out->numel(); - PADDLE_ENFORCE_EQ(param->numel(), sz); - PADDLE_ENFORCE_EQ(grad->numel(), sz); + CHECK_EQ(param->numel(), sz); + CHECK_EQ(grad->numel(), sz); paddle::operators::jit::sgd_attr_t attr(1, sz, 1, sz, 1); const T *lr = learning_rate->template data(); diff --git a/lite/kernels/x86/shape_compute_test.cc b/lite/kernels/x86/shape_compute_test.cc index 88bd98f33ffc7a727de584543bc7392cdbb2883f..9fe5e6c51eaee783072717cea055b00b75c59c07 100644 --- a/lite/kernels/x86/shape_compute_test.cc +++ b/lite/kernels/x86/shape_compute_test.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/shape_compute.h" #include + #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/shape_compute.h" namespace paddle { namespace lite { @@ -23,8 +25,7 @@ namespace kernels { namespace x86 { TEST(shape_x86, retrive_op) { - auto shape = - KernelRegistry::Global().Create("shape"); + auto shape = KernelRegistry::Global().Create("shape"); ASSERT_FALSE(shape.empty()); ASSERT_TRUE(shape.front()); } diff --git a/lite/kernels/x86/slice_compute.h b/lite/kernels/x86/slice_compute.h index ad30215691cde66ab1c7c8c57930fc6d58de7cd5..d32327668bac389e42ff9411be50ce3df42e39ff 100644 --- a/lite/kernels/x86/slice_compute.h +++ b/lite/kernels/x86/slice_compute.h @@ -157,7 +157,7 @@ void slice_compute(const lite::Tensor* in, } } - out->mutable_data(lite::TargetType::kX86); + out->mutable_data(); auto new_out_dims = out->dims(); auto offsets = Eigen::array(); diff --git a/lite/kernels/x86/slice_compute_test.cc b/lite/kernels/x86/slice_compute_test.cc index a62a62cd88ce48c4d47d784ecbc2fd16d0f433d1..b978d4533ccb28ae8826b8304d93f9bdbe85d106 100644 --- a/lite/kernels/x86/slice_compute_test.cc +++ b/lite/kernels/x86/slice_compute_test.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/slice_compute.h" #include + #include #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/slice_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -79,8 +82,7 @@ static void slice_ref(const float* input, } TEST(slice_x86, retrive_op) { - auto slice = - KernelRegistry::Global().Create("slice"); + auto slice = KernelRegistry::Global().Create("slice"); ASSERT_FALSE(slice.empty()); ASSERT_TRUE(slice.front()); } diff --git a/lite/kernels/x86/softmax_compute_test.cc b/lite/kernels/x86/softmax_compute_test.cc index 0debeecb3150dfdd2626b6f8f3f6b5ef63981d93..f3def92992c7ca01e75d12b86b2680768a9fd2ee 100644 --- a/lite/kernels/x86/softmax_compute_test.cc +++ b/lite/kernels/x86/softmax_compute_test.cc @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/softmax_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/softmax_compute.h" namespace paddle { namespace lite { @@ -25,9 +27,7 @@ namespace kernels { namespace x86 { TEST(softmax_x86, retrive_op) { - auto softmax = - KernelRegistry::Global().Create( - "softmax"); + auto softmax = KernelRegistry::Global().Create("softmax"); ASSERT_FALSE(softmax.empty()); ASSERT_TRUE(softmax.front()); } diff --git a/lite/kernels/x86/stack_compute_test.cc b/lite/kernels/x86/stack_compute_test.cc index d105165a98f936b7a6973e57f5199977a0b8bed3..33942fca96508d2868520e5b5e242b83a1f38b0e 100644 --- a/lite/kernels/x86/stack_compute_test.cc +++ b/lite/kernels/x86/stack_compute_test.cc @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/stack_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/stack_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -25,8 +28,7 @@ namespace x86 { // stack TEST(stack_x86, retrive_op) { - auto stack = - KernelRegistry::Global().Create("stack"); + auto stack = KernelRegistry::Global().Create("stack"); ASSERT_FALSE(stack.empty()); ASSERT_TRUE(stack.front()); } diff --git a/lite/kernels/x86/tanh_compute_test.cc b/lite/kernels/x86/tanh_compute_test.cc index 8132505fad6d93997c73ffb735a4a798c15d87a6..6cba531fd34df029a1cdaaf9d6925e379796260d 100644 --- a/lite/kernels/x86/tanh_compute_test.cc +++ b/lite/kernels/x86/tanh_compute_test.cc @@ -13,10 +13,12 @@ // limitations under the License. 
#include + #include #include #include #include + #include "lite/core/op_registry.h" #include "lite/kernels/x86/activation_compute.cc" @@ -26,8 +28,7 @@ namespace kernels { namespace x86 { TEST(tanh_x86, retrive_op) { - auto tanh = - KernelRegistry::Global().Create("tanh"); + auto tanh = KernelRegistry::Global().Create("tanh"); ASSERT_FALSE(tanh.empty()); ASSERT_TRUE(tanh.front()); } diff --git a/lite/kernels/x86/transpose_compute.h b/lite/kernels/x86/transpose_compute.h index 5f6faed2017b6bdef60e7505bf1f0088d86b3ec1..87e7fee7deec711914bd43039301f7180a4bcaa0 100644 --- a/lite/kernels/x86/transpose_compute.h +++ b/lite/kernels/x86/transpose_compute.h @@ -60,7 +60,7 @@ inline void TransCompute(const int dim, trans6(context, in, out, axis); break; default: - PADDLE_THROW("Tensors with rank at most 6 are supported"); + LOG(FATAL) << "Tensors with rank at most 6 are supported"; } } diff --git a/lite/kernels/x86/transpose_compute_test.cc b/lite/kernels/x86/transpose_compute_test.cc index d8533d98258637eba516974e03cd4d88fd452293..aa99db36c450326765d602aaf0b48f72a1a63e13 100644 --- a/lite/kernels/x86/transpose_compute_test.cc +++ b/lite/kernels/x86/transpose_compute_test.cc @@ -12,12 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/kernels/x86/transpose_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" +#include "lite/kernels/x86/transpose_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -25,9 +28,7 @@ namespace x86 { // transpose TEST(transpose_x86, retrive_op) { - auto transpose = - KernelRegistry::Global().Create( - "transpose"); + auto transpose = KernelRegistry::Global().Create("transpose"); ASSERT_FALSE(transpose.empty()); ASSERT_TRUE(transpose.front()); } @@ -75,9 +76,7 @@ TEST(transpose_x86, run_test) { // transpose2 TEST(transpose2_x86, retrive_op) { - auto transpose2 = - KernelRegistry::Global().Create( - "transpose2"); + auto transpose2 = KernelRegistry::Global().Create("transpose2"); ASSERT_FALSE(transpose2.empty()); ASSERT_TRUE(transpose2.front()); } diff --git a/lite/kernels/x86/var_conv_2d_compute_test.cc b/lite/kernels/x86/var_conv_2d_compute_test.cc index edef8cb2df75dfb45ad4964975365d4ddbbe9086..a6787b2e3e84360a63618f130305446316a08e01 100644 --- a/lite/kernels/x86/var_conv_2d_compute_test.cc +++ b/lite/kernels/x86/var_conv_2d_compute_test.cc @@ -12,13 +12,16 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/x86/var_conv_2d_compute.h" #include + #include #include #include + #include "lite/core/op_registry.h" #include "lite/core/tensor.h" +#include "lite/kernels/x86/var_conv_2d_compute.h" + namespace paddle { namespace lite { namespace kernels { @@ -197,9 +200,7 @@ static void var_conv_2d_ref(const lite::Tensor* bottom, } TEST(var_conv_2d_x86, retrive_op) { - auto var_conv_2d = - KernelRegistry::Global().Create( - "var_conv_2d"); + auto var_conv_2d = KernelRegistry::Global().Create("var_conv_2d"); ASSERT_FALSE(var_conv_2d.empty()); ASSERT_TRUE(var_conv_2d.front()); } diff --git a/lite/kernels/xpu/CMakeLists.txt b/lite/kernels/xpu/CMakeLists.txt index 7ded008387b7d7c92fb2ce6b18e73e1c1e51f29d..fdb485df02f366f7f4868965b1f20c6861b03d43 100644 --- a/lite/kernels/xpu/CMakeLists.txt +++ b/lite/kernels/xpu/CMakeLists.txt @@ -6,6 +6,7 @@ if(LITE_WITH_XTCL) add_subdirectory(bridges) add_kernel(subgraph_compute_xpu XPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_xpu subgraph_bridge_engine ${xpu_subgraph_bridges}) else() + # basic add_kernel(conv_compute_xpu XPU basic SRCS conv_compute.cc DEPS ${lite_kernel_deps}) add_kernel(io_copy_compute_xpu XPU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} target_wrapper_xpu) add_kernel(batch_norm_compute_xpu XPU basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps}) @@ -15,15 +16,32 @@ else() add_kernel(mul_compute_xpu XPU basic SRCS mul_compute.cc DEPS ${lite_kernel_deps}) add_kernel(softmax_compute_xpu XPU basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps}) add_kernel(scale_compute_xpu XPU basic SRCS scale_compute.cc DEPS ${lite_kernel_deps}) - add_kernel(lookup_table_compute_xpu XPU basic SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps}) - add_kernel(layer_norm_compute_xpu XPU basic SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps}) add_kernel(dropout_compute_xpu XPU basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps}) add_kernel(matmul_compute_xpu XPU basic SRCS matmul_compute.cc DEPS ${lite_kernel_deps}) add_kernel(stack_compute_xpu XPU basic SRCS stack_compute.cc DEPS ${lite_kernel_deps}) add_kernel(slice_compute_xpu XPU basic SRCS slice_compute.cc DEPS ${lite_kernel_deps}) add_kernel(cast_compute_xpu XPU basic SRCS cast_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(sequence_topk_avg_pooling_compute_xpu XPU basic SRCS sequence_topk_avg_pooling_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(concat_compute_xpu XPU basic SRCS concat_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(search_fc_compute_xpu XPU basic SRCS search_fc_compute.cc DEPS ${lite_kernel_deps}) + + # extra + add_kernel(lookup_table_compute_xpu XPU extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(layer_norm_compute_xpu XPU extra SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(sequence_reverse_compute_xpu XPU extra SRCS sequence_reverse_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(sequence_concat_compute_xpu XPU extra SRCS sequence_concat_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(sequence_arithmetic_compute_xpu XPU extra SRCS sequence_arithmetic_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(sequence_pool_compute_xpu XPU extra SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(match_matrix_tensor_compute_xpu XPU extra SRCS match_matrix_tensor_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(var_conv_2d_compute_xpu XPU extra SRCS var_conv_2d_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(search_grnn_compute_xpu XPU extra SRCS 
search_grnn_compute.cc DEPS ${lite_kernel_deps}) + + # extra(fused kernel) add_kernel(__xpu__resnet50_compute_xpu XPU extra SRCS __xpu__resnet50_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(__xpu__resnet_cbam_compute_xpu XPU extra SRCS __xpu__resnet_cbam_compute.cc DEPS ${lite_kernel_deps}) add_kernel(__xpu__multi_encoder_compute_xpu XPU extra SRCS __xpu__multi_encoder_compute.cc DEPS ${lite_kernel_deps}) add_kernel(__xpu__embedding_with_eltwise_add_compute_xpu XPU extra SRCS __xpu__embedding_with_eltwise_add_compute.cc DEPS ${lite_kernel_deps}) add_kernel(__xpu__fc_compute_xpu XPU extra SRCS __xpu__fc_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(__xpu__search_attention_compute_xpu XPU extra SRCS __xpu__search_attention_compute.cc DEPS ${lite_kernel_deps}) + add_kernel(__xpu__mmdnn_compute_xpu XPU extra SRCS __xpu__mmdnn_compute.cc DEPS ${lite_kernel_deps}) endif() diff --git a/lite/kernels/xpu/__xpu__mmdnn_compute.cc b/lite/kernels/xpu/__xpu__mmdnn_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..39ddecb1139073cb1a0bd8e3c7afc89f1d739da8 --- /dev/null +++ b/lite/kernels/xpu/__xpu__mmdnn_compute.cc @@ -0,0 +1,1386 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
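The remainder of the patch is the new fused kernel file lite/kernels/xpu/__xpu__mmdnn_compute.cc registered above (roughly 1.4k lines). Reading ahead, it is organized as a set of host-side helper classes, each wrapping one stage of the MMDNN match pipeline: MMDNNIdInfo packs the LoD/offset metadata and copies it to the device in a single xpu_memcpy, MMDNNFcOp wraps the int16 GEMM with cached max values, MMDNNGrnnOp chains seq2batch, the batched GRNN and batch2seq, MMDNNAttentionOp chains an FC, two batched matmuls, a sequence softmax and max pooling, and MMDNNMatchConvTopk covers the match-matrix, convolution and top-k average-pooling branch.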
+ +#include +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +namespace { + +void FillMax(float max, float* xpu_ptr) { + float maxs[4] = {max, 0.0f, 0.0f, 0.0f}; + xpu_memcpy( + xpu_ptr, maxs, 4 * sizeof(float), XPUMemcpyKind::XPU_HOST_TO_DEVICE); +} + +void GrnnLayout(int batch, + const std::vector& offset, + std::vector* new_offset_ptr, + std::vector* idx_sorted_ptr) { + auto& new_offset = *new_offset_ptr; + auto& idx_sorted = *idx_sorted_ptr; + + std::vector width; + width.resize(batch); + new_offset.clear(); + idx_sorted.clear(); + + idx_sorted.resize(batch); + for (int i = 0; i < batch; i++) { + width[i] = offset[i + 1] - offset[i]; + idx_sorted[i] = i; + } + std::sort(idx_sorted.data(), + idx_sorted.data() + batch, + [&width](int a, int b) { return width[a] > width[b]; }); + int max_width = width[idx_sorted[0]]; + new_offset.resize(max_width + 1); + new_offset[0] = 0; + int j = batch - 1; + int last_width = 0; + int sub_row = 0; + int sub_col = 0; + + for (int i = 1; i <= max_width;) { + for (int k = j; k >= 0; --k) { + if (width[idx_sorted[k]] > last_width) { + sub_row = width[idx_sorted[k]] - last_width; + sub_col = k + 1; + for (int s = 0; s < sub_row; s++) { + new_offset[i] = new_offset[i - 1] + sub_col; + i++; + } + // move on + last_width = width[idx_sorted[k]]; + j = k - 1; + break; + } + } + } +} + +} // anonymous namespace + +class MMDNNIdInfo { + XPUScratchPadGuard l3_buffer_guard_; + char* l3_buffer_{nullptr}; + std::unique_ptr cpu_buffer_guard_; + char* cpu_buffer_{nullptr}; + + public: + const int64_t* id0_64{nullptr}; + const int64_t* id1_64{nullptr}; + int64_t* lod_64{nullptr}; + int* lod_32{nullptr}; + int* new_offset_32{nullptr}; + int* idx_sorted_32{nullptr}; + + std::vector lod; + std::vector new_offset; + std::vector idx_sorted; + int batch; + int seqlen_max; + int seqlen_sum; + int seqlen_square_sum; + + void Init(int upper_bound_batch, int upper_bound_seqlen) { + int ub_lod_64_size = (upper_bound_batch + 1) * sizeof(int64_t); + int ub_lod_32_size = (upper_bound_batch + 1) * sizeof(int); + int ub_new_offset_32_size = (upper_bound_seqlen + 1) * sizeof(int); + int ub_idx_sorted_32_size = (upper_bound_batch + 1) * sizeof(int); + int total_size = ub_lod_64_size + ub_lod_32_size + ub_new_offset_32_size + + ub_idx_sorted_32_size; + + // TODO(miaotianxiang): use l3? 
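  // Aside, for readers of GrnnLayout above (illustration only, not used by the
  // code): the helper produces the time-major batch layout that
  // xdnn::search_seq2batch / xdnn::search_grnn below consume. With
  // lod = {0, 3, 4, 6} (three sequences of length 3, 1 and 2):
  //   width      = {3, 1, 2}
  //   idx_sorted = {0, 2, 1}     // sequence ids, longest first
  //   new_offset = {0, 3, 5, 6}  // 3 rows at step 0, 2 at step 1, 1 at step 2
  // so new_offset[t + 1] - new_offset[t] is the number of sequences still
  // active at time step t, which is what the batched GRNN iterates over.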
+ l3_buffer_guard_ = TargetWrapperXPU::MallocScratchPad(total_size, false); + l3_buffer_ = reinterpret_cast(l3_buffer_guard_->addr_); + cpu_buffer_guard_.reset(new char[total_size]); + cpu_buffer_ = cpu_buffer_guard_.get(); + } + + void Update(lite::Tensor* id0, lite::Tensor* id1) { + auto& id0_lod = id0->lod()[0]; + lod.clear(); + for (auto e : id0_lod) { + lod.push_back(e); + } + + seqlen_max = 0; + seqlen_sum = 0; + seqlen_square_sum = 0; + batch = lod.size() - 1; + for (int i = 0; i < batch; i++) { + int seqlen = lod[i + 1] - lod[i]; + seqlen_max = std::max(seqlen_max, seqlen); + seqlen_sum = seqlen_sum + seqlen; + seqlen_square_sum = seqlen_square_sum + seqlen * seqlen; + } + GrnnLayout(batch, lod, &new_offset, &idx_sorted); + + id0_64 = id0->data(); + id1_64 = id1->data(); + + int offset = 0; + lod_64 = reinterpret_cast(l3_buffer_ + offset); + memcpy( + cpu_buffer_ + offset, id0_lod.data(), id0_lod.size() * sizeof(int64_t)); + offset += id0_lod.size() * sizeof(int64_t); + lod_32 = reinterpret_cast(l3_buffer_ + offset); + memcpy(cpu_buffer_ + offset, lod.data(), lod.size() * sizeof(int)); + offset += lod.size() * sizeof(int); + new_offset_32 = reinterpret_cast(l3_buffer_ + offset); + memcpy(cpu_buffer_ + offset, + new_offset.data(), + new_offset.size() * sizeof(int)); + offset += new_offset.size() * sizeof(int); + idx_sorted_32 = reinterpret_cast(l3_buffer_ + offset); + memcpy(cpu_buffer_ + offset, + idx_sorted.data(), + idx_sorted.size() * sizeof(int)); + offset += idx_sorted.size() * sizeof(int); + xpu_memcpy( + l3_buffer_, cpu_buffer_, offset, XPUMemcpyKind::XPU_HOST_TO_DEVICE); + } +}; + +class MMDNNFcOp { + const int16_t* weight_{nullptr}; + XPUScratchPadGuard weight_max_guard_; + float* weight_max_{nullptr}; + const float* bias_{nullptr}; + XPUScratchPadGuard in_max_guard_; + float* in_max_{nullptr}; + int n_; + int k_; + xdnn::Activation_t::act_enum act_type_; + XPUScratchPadGuard out_max_guard_; + + public: + float* out_max{nullptr}; + + void Init(const int16_t* weight, + float weight_max, + const float* bias, + int n, + int k, + xdnn::Activation_t::act_enum act_type) { + n_ = n; + k_ = k; + act_type_ = act_type; + + weight_ = weight; + weight_max_guard_ = + TargetWrapperXPU::MallocScratchPad(4 * sizeof(float), false); + weight_max_ = reinterpret_cast(weight_max_guard_->addr_); + FillMax(weight_max, weight_max_); + + bias_ = bias; + + in_max_guard_ = + TargetWrapperXPU::MallocScratchPad(4 * sizeof(float), false); + out_max_guard_ = + TargetWrapperXPU::MallocScratchPad(4 * sizeof(float), false); + in_max_ = reinterpret_cast(in_max_guard_->addr_); + out_max = reinterpret_cast(in_max_guard_->addr_); + } + + void Init(lite::Tensor* weight, + float weight_max, + lite::Tensor* bias, + int n, + int k, + xdnn::Activation_t::act_enum act_type) { + Init(weight->data(), + weight_max, + bias ? 
bias->data() : nullptr, + n, + k, + act_type); + } + + void Infer(xdnn::Context* ctx, + const float* in, + int m, + float* out, + const float* in_max_by_caller = nullptr) { + if (in_max_by_caller == nullptr) { + xdnn::findmax(ctx, in, m * k_, in_max_); + in_max_by_caller = in_max_; + } + xdnn::gemm_int16_maxptr(ctx, + false, + true, + m, + n_, + k_, + 1.0f, + in, + k_, + weight_, + k_, + 0.0f, + out, + n_, + bias_, + act_type_, + in_max_by_caller, + weight_max_, + out_max); + } +}; + +class MMDNNGrnnOp { + MMDNNFcOp fc_e2h0_; + MMDNNFcOp fc_e2h1_; + MMDNNFcOp fc_e2h2_; + const int16_t* dense_h2h_{nullptr}; + float dense_h2h_max_[3]; + XPUScratchPadGuard input_max_guard_; + float* input_max_{nullptr}; + XPUScratchPadGuard hbm_buffer_guard_; + float* hbm_buffer_{nullptr}; + // require: cap_l * max(cap_e_, cap_h_) * 5 + // seq2batch_out: [cap_l, cap_e_] + // fc_e2h_out: [3, cap_l, cap_h_] + // gru_out: [cap_l, cap_h_] + int cap_e_; + int cap_h_; + int max_cap_l_; + + public: + void Init(lite::Tensor* wh, + const std::vector& wh_maxs, + lite::Tensor* wi, + const std::vector& wi_maxs, + int cap_e, + int cap_h, + int max_cap_l) { + cap_e_ = cap_e; + cap_h_ = cap_h; + max_cap_l_ = max_cap_l; + + // weight + auto* dense_e2h = wi->data(); + fc_e2h0_.Init(dense_e2h, + wi_maxs[0], + nullptr, + cap_h_, + cap_e_, + xdnn::Activation_t::LINEAR); + fc_e2h1_.Init(dense_e2h + cap_e_ * cap_h_, + wi_maxs[1], + nullptr, + cap_h_, + cap_e_, + xdnn::Activation_t::LINEAR); + fc_e2h2_.Init(dense_e2h + cap_e_ * cap_h_ * 2, + wi_maxs[2], + nullptr, + cap_h_, + cap_e_, + xdnn::Activation_t::LINEAR); + + dense_h2h_ = wh->data(); + dense_h2h_max_[0] = wh_maxs[0]; + dense_h2h_max_[1] = wh_maxs[1]; + dense_h2h_max_[2] = wh_maxs[2]; + + input_max_guard_ = + TargetWrapperXPU::MallocScratchPad(4 * sizeof(float), false); + input_max_ = reinterpret_cast(input_max_guard_->addr_); + hbm_buffer_guard_ = TargetWrapperXPU::MallocScratchPad( + 5 * std::max(cap_e_, cap_h_) * max_cap_l_ * sizeof(float), false); + hbm_buffer_ = reinterpret_cast(hbm_buffer_guard_->addr_); + } + + void Infer(xdnn::Context* ctx, + const MMDNNIdInfo& sentense, + const float* in, + float* out, + float* l3_buffer = nullptr, + int l3_size = 0) { + int batch = sentense.batch; + int cap_l = sentense.seqlen_sum; + int max_width = sentense.seqlen_max; + + int slot_size = cap_l * std::max(cap_e_, cap_h_); + float* seq2batch_out = hbm_buffer_; + float* fc_e2h_out = hbm_buffer_ + 1 * slot_size; + float* gru_out = hbm_buffer_ + 4 * slot_size; + if (l3_size > 0 && l3_size >= 5 * slot_size * sizeof(float)) { + seq2batch_out = l3_buffer; + fc_e2h_out = l3_buffer + 1 * slot_size; + gru_out = l3_buffer + 4 * slot_size; + } + + xdnn::search_seq2batch(ctx, + batch, + max_width, + cap_e_, + sentense.idx_sorted_32, + sentense.lod_32, + sentense.new_offset_32, + in, + seq2batch_out); + + xdnn::findmax(ctx, in, cap_l * cap_e_, input_max_); + fc_e2h0_.Infer(ctx, seq2batch_out, cap_l, fc_e2h_out, input_max_); + fc_e2h1_.Infer( + ctx, seq2batch_out, cap_l, fc_e2h_out + cap_l * cap_h_, input_max_); + fc_e2h2_.Infer( + ctx, seq2batch_out, cap_l, fc_e2h_out + cap_l * cap_h_ * 2, input_max_); + xdnn::search_grnn(ctx, + cap_l, + cap_h_, + cap_e_, + max_width, + sentense.new_offset_32, + fc_e2h_out, + dense_h2h_, + gru_out, + dense_h2h_max_[0], + dense_h2h_max_[1], + dense_h2h_max_[2]); + + xdnn::search_batch2seq(ctx, + batch, + max_width, + cap_h_, + sentense.idx_sorted_32, + sentense.lod_32, + sentense.new_offset_32, + gru_out, + out); + } +}; + +class MMDNNAttentionOp { + int dim_; 
+ float alpha0_; + float alpha1_; + MMDNNFcOp seqfc_; + XPUScratchPadGuard hbm_buffer_guard_; + float* hbm_buffer_{nullptr}; + // require: cap_l * dim_ + seqlen_square_sum + // seqfc_out: [cap_l, dim_] + // batchgemm0_out: [seqlen_square_sum] + // seq_softmax_out: [seqlen_square_sum], reuse of batchgemm0_out + // batchgemm1_out: [cap_l, dim_], reuse of seqfc_out + + public: + void Init(lite::Tensor* att_fc_w, + float att_fc_w_max, + lite::Tensor* att_fc_b, + int dim, + int upper_bound_batch, + int upper_bound_seqlen) { + dim_ = dim; + alpha0_ = 0.0883883461356163f; // TODO(miaotianxiang): + alpha1_ = 1.0f; + + seqfc_.Init(att_fc_w, + att_fc_w_max, + att_fc_b, + dim_, + dim_, + xdnn::Activation_t::LINEAR); + hbm_buffer_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch * (upper_bound_seqlen * dim_ + + upper_bound_seqlen * upper_bound_seqlen)) * + sizeof(float), + false); + hbm_buffer_ = reinterpret_cast(hbm_buffer_guard_->addr_); + } + + void Infer(xdnn::Context* ctx, + const MMDNNIdInfo& sentense, + const float* input, + float* pool_out, + float* l3_buffer = nullptr, + int l3_size = 0) { + int batch = sentense.batch; + int cap_l = sentense.seqlen_sum; + int max_width = sentense.seqlen_max; + int* lod_32 = sentense.lod_32; + + float* seqfc_out = hbm_buffer_; + float* batchgemm0_out = hbm_buffer_ + cap_l * dim_; + float* seq_softmax_out = batchgemm0_out; + float* batchgemm1_out = seqfc_out; + if (l3_size > 0 && + l3_size >= + (cap_l * dim_ + sentense.seqlen_square_sum) * sizeof(float)) { + seqfc_out = l3_buffer; + batchgemm0_out = l3_buffer + cap_l * dim_; + seq_softmax_out = batchgemm0_out; + batchgemm1_out = seqfc_out; + } + + seqfc_.Infer(ctx, input, cap_l, seqfc_out); + xdnn::search_noaligned_mat_mul(ctx, + 0, + 1, + batch, + lod_32, + max_width, + dim_, + alpha0_, + input, + seqfc_out, + batchgemm0_out); + xdnn::search_seq_softmax( + ctx, batchgemm0_out, seq_softmax_out, lod_32, batch, max_width); + xdnn::search_noaligned_mat_mul(ctx, + 0, + 0, + batch, + lod_32, + max_width, + dim_, + alpha1_, + seq_softmax_out, + input, + batchgemm1_out); + xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::MAX_WITHOUT_INDEX, + batch, + lod_32, + dim_, + batchgemm1_out, + nullptr, + pool_out); + } +}; + +class MMDNNMatchConvTopk { + std::vector topks_; + int dim_t_; + int dim_in_; + int out_channel_; + + MMDNNFcOp xw_fc_; + const int16_t* conv_weight_{nullptr}; + float conv_weight_max_; + XPUScratchPadGuard hbm_buffer_guard_; + float* hbm_buffer_{nullptr}; + // xw_out: [sum(left_len), dim_t_ * dim_in_] + // xwy_out: [sum(left_len * right_len) * dim_t_] + // conv_out: [sum(left_len * right_len) * out_channel_] + // seq_concat_out: [sum(left_len * right_len) * (dim_t_ + out_channel_)] + + XPUScratchPadGuard left_lod_32_guard_; + int* left_lod_32_{nullptr}; + XPUScratchPadGuard right_lod_32_guard_; + int* right_lod_32_{nullptr}; + XPUScratchPadGuard match_lod_32_guard_; + int* match_lod_32_{nullptr}; + XPUScratchPadGuard conv_lod_32_guard_; + int* conv_lod_32_{nullptr}; + XPUScratchPadGuard topk_offset_32_guard_; + int* topk_offset_32_{nullptr}; + XPUScratchPadGuard topks_xpu_guard_; + int* topks_xpu_{nullptr}; + XPUScratchPadGuard useless_topk_pos_guard_; + int* useless_topk_pos_{nullptr}; + + public: + float* seq_avg_topk_out{nullptr}; + + void Init(lite::Tensor* input_w, + float input_w_max, + lite::Tensor* conv_w, + float conv_w_max, + int dim_t, + int dim_in, + int upper_bound_batch, + int upper_bound_seqlen, + const std::vector& topks) { + dim_t_ = dim_t; + dim_in_ = dim_in; + 
out_channel_ = 5; // TODO(miaotianxiang): + topks_ = topks; + + xw_fc_.Init(input_w, + input_w_max, + nullptr, + dim_t_ * dim_in_, + dim_in_, + xdnn::Activation_t::LINEAR); + conv_weight_ = conv_w->data(); + conv_weight_max_ = conv_w_max; + + hbm_buffer_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch * upper_bound_seqlen * dim_t_ * dim_in_ + + upper_bound_batch * upper_bound_seqlen * upper_bound_seqlen * + (dim_t_ + out_channel_) * 2) * + sizeof(float), + false); + hbm_buffer_ = reinterpret_cast(hbm_buffer_guard_->addr_); + + left_lod_32_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch + 1) * sizeof(int), false); + left_lod_32_ = reinterpret_cast(left_lod_32_guard_->addr_); + right_lod_32_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch + 1) * sizeof(int), false); + right_lod_32_ = reinterpret_cast(right_lod_32_guard_->addr_); + match_lod_32_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch + 1) * sizeof(int), false); + match_lod_32_ = reinterpret_cast(match_lod_32_guard_->addr_); + conv_lod_32_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch + 1) * sizeof(int), false); + conv_lod_32_ = reinterpret_cast(conv_lod_32_guard_->addr_); + topk_offset_32_guard_ = TargetWrapperXPU::MallocScratchPad( + (upper_bound_batch + 1) * sizeof(int), false); + topk_offset_32_ = reinterpret_cast(topk_offset_32_guard_->addr_); + topks_xpu_guard_ = + TargetWrapperXPU::MallocScratchPad(topks_.size() * sizeof(int), false); + topks_xpu_ = reinterpret_cast(topks_xpu_guard_->addr_); + xpu_memcpy(topks_xpu_, + topks_.data(), + topks_.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + useless_topk_pos_guard_ = + TargetWrapperXPU::MallocScratchPad(4 * sizeof(int), false); + useless_topk_pos_ = reinterpret_cast(useless_topk_pos_guard_->addr_); + } + + void Infer(xdnn::Context* ctx, + lite::Tensor* left, + lite::Tensor* right, + lite::Tensor* out, + float* l3_buffer = nullptr, + int l3_size = 0) { + auto left_lod = left->lod()[0]; + auto right_lod = right->lod()[0]; + int batch = left_lod.size() - 1; + + std::vector left_lod_32_cpu; + for (auto e : left_lod) { + left_lod_32_cpu.push_back(e); + } + xpu_memcpy(left_lod_32_, + left_lod_32_cpu.data(), + left_lod_32_cpu.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + std::vector right_lod_32_cpu; + for (auto e : right_lod) { + right_lod_32_cpu.push_back(e); + } + xpu_memcpy(right_lod_32_, + right_lod_32_cpu.data(), + right_lod_32_cpu.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + std::vector lod_match = {0}; + std::vector lod_conv = {0}; + std::vector lod_topk = {0}; + int x_mul_y_sum = 0; + int left_seqlen_sum = 0; + int left_seqlen_max = 0; + int right_seqlen_sum = 0; + int right_seqlen_max = 0; + for (int i = 0; i < batch; i++) { + int len_x = left_lod[i + 1] - left_lod[i]; + int len_y = right_lod[i + 1] - right_lod[i]; + int imgsize = len_x * len_y; + x_mul_y_sum = x_mul_y_sum + imgsize; + lod_match.push_back(lod_match.back() + imgsize * dim_t_); + lod_conv.push_back(lod_conv.back() + imgsize * out_channel_); + lod_topk.push_back(lod_topk.back() + imgsize * (dim_t_ + out_channel_)); + + left_seqlen_max = std::max(left_seqlen_max, len_x); + right_seqlen_max = std::max(right_seqlen_max, len_y); + left_seqlen_sum += len_x; + right_seqlen_sum += len_y; + } + xpu_memcpy(match_lod_32_, + lod_match.data(), + lod_match.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(conv_lod_32_, + lod_conv.data(), + lod_conv.size() * sizeof(int), 
+ XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(topk_offset_32_, + lod_topk.data(), + lod_topk.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + float* xwy_out = hbm_buffer_; + float* conv_out = hbm_buffer_ + x_mul_y_sum * dim_t_; + float* seq_concat_out = hbm_buffer_ + x_mul_y_sum * (dim_t_ + out_channel_); + float* xw_out = hbm_buffer_ + x_mul_y_sum * (dim_t_ + out_channel_) * 2; + int total_len = x_mul_y_sum * (dim_t_ + out_channel_) * 2 + + left_seqlen_sum * dim_t_ * dim_in_; + if (l3_size > 0 && l3_size >= total_len * sizeof(float)) { + xwy_out = l3_buffer; + conv_out = l3_buffer + x_mul_y_sum * dim_t_; + seq_concat_out = l3_buffer + x_mul_y_sum * (dim_t_ + out_channel_); + xw_out = l3_buffer + x_mul_y_sum * (dim_t_ + out_channel_) * 2; + } + seq_avg_topk_out = out->mutable_data(TARGET(kXPU)); + + int max_width = std::max(left_seqlen_max, right_seqlen_max); + xw_fc_.Infer(ctx, left->data(), left_seqlen_sum, xw_out); + xdnn::match_matrix_tensor(ctx, + batch, + xw_out, + right->data(), + left_lod_32_, + right_lod_32_, + dim_t_, + dim_in_, + xwy_out, + xw_fc_.out_max, + xdnn::Activation_t::RELU, + max_width); + xdnn::search_varconv( + ctx, + batch, + dim_t_, + out_channel_, + 5, + 5, + 1, + 1, + xwy_out, + conv_weight_, + right_lod_32_, + left_lod_32_, + conv_out, + conv_weight_max_, + xdnn::Activation_t::RELU); // TODO(miaotianxiang): + xdnn::sequence_concat(ctx, + xwy_out, + match_lod_32_, + conv_out, + conv_lod_32_, + seq_concat_out, + batch); + xdnn::sequence_topk_avg_pooling(ctx, + seq_concat_out, + seq_avg_topk_out, + useless_topk_pos_, + batch, + dim_t_ + out_channel_, + topk_offset_32_, + left_lod_32_, + right_lod_32_, + topks_xpu_, + topks_.size()); + } +}; + +class MMDNNBidEmbGrnnAtt { + const float* table_{nullptr}; + int table_len_; + int emb_dim_; + int cap_h_; + MMDNNGrnnOp bi_fw_; + MMDNNGrnnOp bi_rv_; + MMDNNAttentionOp att_; + XPUScratchPadGuard hbm_buffer_guard_; + float* hbm_buffer_{nullptr}; + // require at least: 4 * cap_l * emb_dim_ + // emb_rv: [cap_l, emb_dim_] + // grnn_fw: [cap_l, emb_dim_] + // grnn_rv: [cap_l, emb_dim_] + // grnn_rv_rv: [cap_l, emb_dim_] + // concat_2in: [cap_l, 2 * emb_dim_] + // L3.bi_fw: 5 * cap_l * emb_dim_ + // L3.bi_rv: 5 * cap_l * emb_dim_ + // L3.att: cap_l * 2 * emb_dim_ + seqlen_square_sum + + // execution-plan: + // 1. bid_emb_ew, alloc(emb_rv) + // 2. bi_rv, alloc(grnn_rv) + // 3. free(emb_rv) + // 4. sequence_reverse, alloc(grnn_rv_rv) + // 5. sequence_pooling(grnn_rv) + // 6. free(grnn_rv) + // 7. bi_fw alloc(grnn_fw) + // 8. sequence_pooling(grnn_fw) + // 9. concat_2 alloc(concat_2in) + // 10. concat_3 + // 11. 
att + + // alloc-plan: + // [0]: emb_rv, grnn_rv_rv + // [1]: grnn_rv, grnn_fw + // [2, 3]: concat_2in + // [2, 3, 4, 5, 6]: L3.bi_fw, L3.bi_rv + // [4, 5, ..., ?]: L3.att + + public: + float* emb_fw{nullptr}; + float* concat_3in{nullptr}; + float* pool_fw{nullptr}; + float* pool_rv{nullptr}; + float* att_out{nullptr}; + + void Init(lite::Tensor* table, + lite::Tensor* fw_wh, + const std::vector& fw_wh_maxs, + lite::Tensor* fw_wi, + const std::vector& fw_wi_maxs, + lite::Tensor* rv_wh, + const std::vector& rv_wh_maxs, + lite::Tensor* rv_wi, + const std::vector& rv_wi_maxs, + lite::Tensor* att_fc_w, + float att_fc_w_max, + lite::Tensor* att_fc_b, + int upper_bound_batch, + int upper_bound_seqlen) { + table_ = table->data(); + table_len_ = table->dims()[0]; + emb_dim_ = table->dims()[1]; + cap_h_ = emb_dim_; + int max_cap_l = upper_bound_batch * upper_bound_seqlen; + + bi_fw_.Init( + fw_wh, fw_wh_maxs, fw_wi, fw_wi_maxs, emb_dim_, cap_h_, max_cap_l); + bi_rv_.Init( + rv_wh, rv_wh_maxs, rv_wi, rv_wi_maxs, emb_dim_, cap_h_, max_cap_l); + att_.Init(att_fc_w, + att_fc_w_max, + att_fc_b, + 2 * cap_h_, + upper_bound_batch, + upper_bound_seqlen); + + hbm_buffer_guard_ = TargetWrapperXPU::MallocScratchPad( + 4 * max_cap_l * cap_h_ * sizeof(float), false); + hbm_buffer_ = reinterpret_cast(hbm_buffer_guard_->addr_); + } + + void Infer(xdnn::Context* ctx, + int batch, + const MMDNNIdInfo& sentense, + lite::Tensor* grnn_fw_pool_out, + lite::Tensor* grnn_rv_pool_out, + lite::Tensor* att_pool_out, + lite::Tensor* concat_3in1_out, + lite::Tensor* emb_fw_out, + float* l3_buffer = nullptr, + int l3_size = 0) { + int cap_l = sentense.seqlen_sum; + int slot_len = cap_l * cap_h_; + + float* emb_rv = hbm_buffer_; + float* grnn_fw = hbm_buffer_ + slot_len; + float* grnn_rv = hbm_buffer_ + slot_len; + float* grnn_rv_rv = hbm_buffer_; + float* concat_2in = hbm_buffer_ + 2 * slot_len; + if (l3_size > 0 && l3_size >= 4 * slot_len * sizeof(float)) { + emb_rv = l3_buffer; + grnn_fw = l3_buffer + slot_len; + grnn_rv = l3_buffer + slot_len; + grnn_rv_rv = l3_buffer; + } + emb_fw = emb_fw_out->mutable_data(TARGET(kXPU)); + concat_3in = concat_3in1_out->mutable_data(TARGET(kXPU)); + pool_fw = grnn_fw_pool_out->mutable_data(TARGET(kXPU)); + pool_rv = grnn_rv_pool_out->mutable_data(TARGET(kXPU)); + att_out = att_pool_out->mutable_data(TARGET(kXPU)); + + xdnn::search_bid_emb_ew(ctx, + batch, + sentense.lod_64, + sentense.id0_64, + sentense.id1_64, + table_, + table_len_, + emb_dim_, + emb_fw, + emb_rv, + table_len_ - 2, + 1); + bi_rv_.Infer(ctx, + sentense, + emb_rv, + grnn_rv, + l3_buffer + 2 * slot_len, + l3_size - 2 * slot_len * sizeof(float)); + xdnn::sequence_reverse( + ctx, batch, sentense.lod_32, cap_h_, grnn_rv, grnn_rv_rv); + xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::LAST, + batch, + sentense.lod_32, + cap_h_, + grnn_rv, + nullptr, + pool_rv); + + bi_fw_.Infer(ctx, + sentense, + emb_fw, + grnn_fw, + l3_buffer + 2 * slot_len, + l3_size - 2 * slot_len * sizeof(float)); + xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::LAST, + batch, + sentense.lod_32, + cap_h_, + grnn_fw, + nullptr, + pool_fw); + const int concat_widths[] = {cap_h_, cap_h_, cap_h_}; + const float* concat_ptrs[] = {emb_fw, grnn_fw, grnn_rv_rv}; + xdnn::concat( + ctx, cap_l, concat_widths + 1, 2, concat_ptrs + 1, concat_2in); + xdnn::concat(ctx, cap_l, concat_widths, 3, concat_ptrs, concat_3in); + att_.Infer(ctx, + sentense, + concat_2in, + att_out, + l3_buffer + 4 * slot_len, + l3_size - 4 * slot_len * sizeof(float)); + } +}; + 
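+// Worked example of MMDNNBidEmbGrnnAtt's alloc-plan (hypothetical sizes): with + // batch = 2, seqlens = {3, 5} and emb_dim_ = cap_h_ = 128, cap_l = 8 and + // slot_len = cap_l * cap_h_ = 1024 floats; emb_rv / grnn_rv_rv share + // [hbm_buffer_, hbm_buffer_ + slot_len), grnn_rv / grnn_fw share + // [hbm_buffer_ + slot_len, hbm_buffer_ + 2 * slot_len), and concat_2in occupies + // [hbm_buffer_ + 2 * slot_len, hbm_buffer_ + 4 * slot_len), so the + // 4 * max_cap_l * cap_h_ * sizeof(float) scratch pad reserved in Init() covers + // every step of the execution-plan whenever cap_l <= upper_bound_batch * upper_bound_seqlen.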
+class MMDNNEmbAtt { + const float* table_{nullptr}; + int table_len_; + int emb_dim_; + MMDNNAttentionOp att_; + + public: + float* emb_fw{nullptr}; + float* att_out{nullptr}; + + void Init(lite::Tensor* table, + lite::Tensor* att_fc_w, + float att_fc_w_max, + lite::Tensor* att_fc_b, + int upper_bound_batch, + int upper_bound_seqlen) { + table_ = table->data(); + table_len_ = table->dims()[0]; + emb_dim_ = table->dims()[1]; + att_.Init(att_fc_w, + att_fc_w_max, + att_fc_b, + emb_dim_, + upper_bound_batch, + upper_bound_seqlen); + } + + void Infer(xdnn::Context* ctx, + int batch, + const MMDNNIdInfo& sentense, + lite::Tensor* att_pool_out, + lite::Tensor* emb_fw_out, + float* l3_buffer = nullptr, + int l3_size = 0) { + emb_fw = emb_fw_out->mutable_data(TARGET(kXPU)); + att_out = att_pool_out->mutable_data(TARGET(kXPU)); + + int cap_l = sentense.lod.back(); + const float* emb_tables[] = {table_, table_}; + const int64_t* emb_indices[] = {sentense.id0_64, sentense.id1_64}; + xdnn::embedding_with_ewadd(ctx, + emb_dim_, + cap_l, + 2, + table_len_ - 2, + emb_tables, + emb_indices, + nullptr, + nullptr, + emb_fw); + att_.Infer(ctx, sentense, emb_fw, att_out, l3_buffer, l3_size); + } +}; + +class MMDNNMergeAll { + MMDNNGrnnOp coverage_fw_; + MMDNNGrnnOp coverage_rv_; + int cap_e_; + int cap_h_; + + // TODO(miaotianxiang): + const int fc0_k_ = 1152; + const int fc0_n_ = 512; + const int fc1_k_ = 640; + const int fc1_n_ = 320; + const int fc2_k_ = 320; + const int fc2_n_ = 1; + MMDNNFcOp fc0_; + MMDNNFcOp fc1_; + MMDNNFcOp fc2_; + + XPUScratchPadGuard hbm_buffer_guard_; + float* hbm_buffer_{nullptr}; + // topk_concat_out_fw: [cap_l, cap_e_] <= [cap_l, cap_h_] + // topk_concat_out_rv: [cap_l, cap_e_] <= [cap_l, cap_h_] + // grnn_fw: [cap_l, cap_h_] + // grnn_rv: [cap_l, cap_h_] + // pool_fw: [batch, cap_h_] + // pool_rv: [batch, cap_h_] + // fc0_in: [batch, fc0_k_] + // fc0_out: [batch, fc0_n_] + // fc1_in: [batch, fc1_k_] + // fc1_out: [batch, fc1_n_] + // fc2_out: [batch, fc2_n_] + + public: + void Init(lite::Tensor* grnn_fw_wh, + std::vector grnn_fw_wh_maxs, + lite::Tensor* grnn_fw_wi, + std::vector grnn_fw_wi_maxs, + lite::Tensor* grnn_rv_wh, + std::vector grnn_rv_wh_maxs, + lite::Tensor* grnn_rv_wi, + std::vector grnn_rv_wi_maxs, + lite::Tensor* fc0_w, + float fc0_w_max, + lite::Tensor* fc0_b, + lite::Tensor* fc1_w, + float fc1_w_max, + lite::Tensor* fc1_b, + lite::Tensor* fc2_w, + float fc2_w_max, + lite::Tensor* fc2_b, + int upper_bound_batch, + int upper_bound_seqlen) { + int max_cap_l = upper_bound_batch * upper_bound_seqlen; + cap_e_ = grnn_fw_wi->dims()[2]; + cap_h_ = grnn_fw_wi->dims()[1]; + + coverage_fw_.Init(grnn_fw_wh, + grnn_fw_wh_maxs, + grnn_fw_wi, + grnn_fw_wi_maxs, + cap_e_, + cap_h_, + max_cap_l); + coverage_rv_.Init(grnn_rv_wh, + grnn_rv_wh_maxs, + grnn_rv_wi, + grnn_rv_wi_maxs, + cap_e_, + cap_h_, + max_cap_l); + + fc0_.Init( + fc0_w, fc0_w_max, fc0_b, fc0_n_, fc0_k_, xdnn::Activation_t::RELU); + fc1_.Init( + fc1_w, fc1_w_max, fc1_b, fc1_n_, fc1_k_, xdnn::Activation_t::RELU); + fc2_.Init( + fc2_w, fc2_w_max, fc2_b, fc2_n_, fc2_k_, xdnn::Activation_t::LINEAR); + + int hbm_total_len = max_cap_l * cap_h_ * 4 + + upper_bound_batch * (2 * cap_h_ + fc0_k_ + fc0_n_ + + fc1_k_ + fc1_n_ + fc2_n_); + hbm_buffer_guard_ = TargetWrapperXPU::MallocScratchPad( + hbm_total_len * sizeof(float), false); + hbm_buffer_ = reinterpret_cast(hbm_buffer_guard_->addr_); + } + + void Infer(xdnn::Context* ctx, + const MMDNNIdInfo& sentense, + const std::vector concat_2in1_x, + const std::vector 
concat_7in1_x, + lite::Tensor* out, + float* l3_buffer = nullptr, + int l3_size = 0) { + int batch = sentense.batch; + int cap_l = sentense.seqlen_sum; + + float* topk_concat_out_fw = hbm_buffer_; + int hbm_total_len = + cap_l * cap_h_ * 4 + + batch * (2 * cap_h_ + fc0_k_ + fc0_n_ + fc1_k_ + fc1_n_ + fc2_n_); + if (l3_size > 0 && l3_size >= hbm_total_len * sizeof(float)) { + topk_concat_out_fw = l3_buffer; + } + float* topk_concat_out_rv = topk_concat_out_fw + cap_l * cap_h_; + float* grnn_fw = topk_concat_out_rv + cap_l * cap_h_; + float* grnn_rv = grnn_fw + cap_l * cap_h_; + float* pool_fw = grnn_rv + cap_l * cap_h_; + float* pool_rv = pool_fw + batch * cap_h_; + float* fc0_in = pool_fw + batch * cap_h_ * 2; + float* fc0_out = fc0_in + batch * fc0_k_; + float* fc1_in = fc0_out + batch * fc0_n_; + float* fc1_out = fc1_in + batch * fc1_k_; + // float* fc2_out = fc1_out + batch * fc1_n_; + float* fc2_out = out->mutable_data(TARGET(kXPU)); + + const int concat_widths[] = {static_cast(concat_2in1_x[0]->dims()[1]), + static_cast(concat_2in1_x[1]->dims()[1])}; + const float* concat_ptrs[] = {concat_2in1_x[0]->data(), + concat_2in1_x[1]->data()}; + xdnn::concat( + ctx, cap_l, concat_widths, 2, concat_ptrs, topk_concat_out_fw); + xdnn::sequence_reverse(ctx, + batch, + sentense.lod_32, + cap_e_, + topk_concat_out_fw, + topk_concat_out_rv); + coverage_fw_.Infer(ctx, + sentense, + topk_concat_out_fw, + grnn_fw, + l3_buffer + hbm_total_len, + l3_size - hbm_total_len * sizeof(float)); + coverage_rv_.Infer(ctx, + sentense, + topk_concat_out_rv, + grnn_rv, + l3_buffer + hbm_total_len, + l3_size - hbm_total_len * sizeof(float)); + xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::LAST, + batch, + sentense.lod_32, + cap_h_, + grnn_fw, + nullptr, + pool_fw); + xdnn::sequence_pooling_forward(ctx, + xdnn::Pooling_t::LAST, + batch, + sentense.lod_32, + cap_h_, + grnn_rv, + nullptr, + pool_rv); + + const int concat_widths_fc0[] = { + static_cast(concat_7in1_x[0]->dims()[1]), + static_cast(concat_7in1_x[1]->dims()[1]), + static_cast(concat_7in1_x[2]->dims()[1]), + static_cast(concat_7in1_x[3]->dims()[1]), + static_cast(concat_7in1_x[4]->dims()[1]), + static_cast(concat_7in1_x[5]->dims()[1]), + static_cast(concat_7in1_x[6]->dims()[1]), + }; + const float* concat_ptrs_fc0[] = { + concat_7in1_x[0]->data(), + concat_7in1_x[1]->data(), + concat_7in1_x[2]->data(), + concat_7in1_x[3]->data(), + concat_7in1_x[4]->data(), + concat_7in1_x[5]->data(), + concat_7in1_x[6]->data(), + }; + const int concat_widths_fc1[] = {cap_h_, cap_h_, fc0_n_}; + const float* concat_ptrs_fc1[] = {pool_fw, pool_rv, fc0_out}; + + xdnn::concat( + ctx, batch, concat_widths_fc0, 7, concat_ptrs_fc0, fc0_in); + fc0_.Infer(ctx, fc0_in, batch, fc0_out); + xdnn::concat( + ctx, batch, concat_widths_fc1, 3, concat_ptrs_fc1, fc1_in); + fc1_.Infer(ctx, fc1_in, batch, fc1_out); + fc2_.Infer(ctx, fc1_out, batch, fc2_out); + } +}; + +class XPUMmdnnBidEmbGrnnAttCompute + : public KernelLite { + public: + using param_t = operators::XPUMmdnnBidEmbGrnnAttParam; + + void PrepareForRun() override; + + void Run() override; + + private: + MMDNNIdInfo id_; + MMDNNBidEmbGrnnAtt compound_; + int upper_bound_batch_ = 40; + int upper_bound_seqlen_ = 512; +}; + +void XPUMmdnnBidEmbGrnnAttCompute::PrepareForRun() { + auto& param = this->Param(); + + id_.Init(upper_bound_batch_, upper_bound_seqlen_); + compound_.Init(param.emb_tbl, + param.grnn_fw_wh, + param.grnn_fw_wh_maxs, + param.grnn_fw_wi, + param.grnn_fw_wi_maxs, + param.grnn_rv_wh, + param.grnn_rv_wh_maxs, + 
param.grnn_rv_wi, + param.grnn_rv_wi_maxs, + param.att_fc_w, + param.att_fc_w_max, + param.att_fc_b, + upper_bound_batch_, + upper_bound_seqlen_); +} + +void XPUMmdnnBidEmbGrnnAttCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* xpu_ctx = ctx.GetRawContext(); + + int batch = param.id0->lod()[0].size() - 1; + id_.Update(param.id0, param.id1); + compound_.Infer(ctx.GetRawContext(), + batch, + id_, + param.grnn_fw_pool_out, + param.grnn_rv_pool_out, + param.att_pool_out, + param.concat_3in1_out, + param.emb_fw_out, + reinterpret_cast( + reinterpret_cast(xpu_ctx->workspace_l3_ptr) + + xpu_ctx->used_l3_size), + xpu_ctx->workspace_l3_size - xpu_ctx->used_l3_size); +} + +class XPUMmdnnBidEmbAttCompute + : public KernelLite { + public: + using param_t = operators::XPUMmdnnBidEmbAttParam; + + void PrepareForRun() override; + + void Run() override; + + private: + MMDNNIdInfo id_; + MMDNNEmbAtt compound_; + int upper_bound_batch_ = 40; + int upper_bound_seqlen_ = 512; +}; + +void XPUMmdnnBidEmbAttCompute::PrepareForRun() { + auto& param = this->Param(); + + id_.Init(upper_bound_batch_, upper_bound_seqlen_); + compound_.Init(param.emb_tbl, + param.att_fc_w, + param.att_fc_w_max, + param.att_fc_b, + upper_bound_batch_, + upper_bound_seqlen_); +} + +void XPUMmdnnBidEmbAttCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* xpu_ctx = ctx.GetRawContext(); + + int batch = param.id0->lod()[0].size() - 1; + id_.Update(param.id0, param.id1); + compound_.Infer(ctx.GetRawContext(), + batch, + id_, + param.att_pool_out, + param.emb_fw_out, + reinterpret_cast( + reinterpret_cast(xpu_ctx->workspace_l3_ptr) + + xpu_ctx->used_l3_size), + xpu_ctx->workspace_l3_size - xpu_ctx->used_l3_size); +} + +class XPUMmdnnMatchConvTopkCompute + : public KernelLite { + public: + using param_t = operators::XPUMmdnnMatchConvTopkParam; + + void PrepareForRun() override; + + void Run() override; + + private: + MMDNNMatchConvTopk compound_; + int upper_bound_batch_ = 40; + int upper_bound_seqlen_ = 512; +}; + +void XPUMmdnnMatchConvTopkCompute::PrepareForRun() { + auto& param = this->Param(); + + compound_.Init(param.input_w, + param.input_w_max, + param.conv_w, + param.conv_w_max, + param.dim_t, + param.input_w->dims()[0], + upper_bound_batch_, + upper_bound_seqlen_, + param.topks); +} + +void XPUMmdnnMatchConvTopkCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* xpu_ctx = ctx.GetRawContext(); + + compound_.Infer(ctx.GetRawContext(), + param.input_x, + param.input_y, + param.topk_out, + reinterpret_cast( + reinterpret_cast(xpu_ctx->workspace_l3_ptr) + + xpu_ctx->used_l3_size), + xpu_ctx->workspace_l3_size - xpu_ctx->used_l3_size); +} + +class XPUMmdnnMergeAllCompute + : public KernelLite { + public: + using param_t = operators::XPUMmdnnMergeAllParam; + + void PrepareForRun() override; + + void Run() override; + + private: + MMDNNIdInfo id_; + MMDNNMergeAll compound_; + int upper_bound_batch_ = 40; + int upper_bound_seqlen_ = 512; +}; + +void XPUMmdnnMergeAllCompute::PrepareForRun() { + auto& param = this->Param(); + + id_.Init(upper_bound_batch_, upper_bound_seqlen_); + compound_.Init(param.grnn_fw_wh, + param.grnn_fw_wh_maxs, + param.grnn_fw_wi, + param.grnn_fw_wi_maxs, + param.grnn_rv_wh, + param.grnn_rv_wh_maxs, + param.grnn_rv_wi, + param.grnn_rv_wi_maxs, + param.fc0_w, + param.fc0_w_max, + param.fc0_b, + param.fc1_w, + param.fc1_w_max, + param.fc1_b, + param.fc2_w, + param.fc2_w_max, + param.fc2_b, + 
upper_bound_batch_, + upper_bound_seqlen_); +} + +void XPUMmdnnMergeAllCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* xpu_ctx = ctx.GetRawContext(); + + id_.Update(param.concat_2in1_x[0], param.concat_2in1_x[1]); + compound_.Infer(ctx.GetRawContext(), + id_, + param.concat_2in1_x, + param.concat_7in1_x, + param.out, + reinterpret_cast( + reinterpret_cast(xpu_ctx->workspace_l3_ptr) + + xpu_ctx->used_l3_size), + xpu_ctx->workspace_l3_size - xpu_ctx->used_l3_size); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(__xpu__mmdnn_bid_emb_grnn_att, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnBidEmbGrnnAttCompute, + def) + .BindInput("id0", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("id1", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("emb_tbl", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_fw_wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_fw_wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_rv_wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_rv_wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("att_fc_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("att_fc_b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("grnn_fw_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("grnn_rv_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("att_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("concat_3in1_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("emb_fw_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__mmdnn_bid_emb_att, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnBidEmbAttCompute, + def) + .BindInput("id0", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("id1", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindInput("emb_tbl", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("att_fc_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("att_fc_b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("att_pool_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("concat_3in1_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("emb_fw_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__mmdnn_match_conv_topk, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnMatchConvTopkCompute, + def) + .BindInput("input_x", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("input_y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("input_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("conv_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("topk_out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(__xpu__mmdnn_merge_all, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnMergeAllCompute, + def) + .BindInput("concat_7in1_x", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("concat_2in1_x", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_fw_wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_fw_wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_rv_wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("grnn_rv_wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc0_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc0_b", 
{LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc1_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc1_b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc2_w", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("fc2_b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/__xpu__resnet_cbam_compute.cc b/lite/kernels/xpu/__xpu__resnet_cbam_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..0d57445cd44953f504e292ad38d44d047daa3a7a --- /dev/null +++ b/lite/kernels/xpu/__xpu__resnet_cbam_compute.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/__xpu__resnet_cbam_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void XPUResNetCbamCompute::PrepareForRun() { + auto& param = this->Param(); + + for (auto* filter : param.filter) { + arg_filter_.push_back( + reinterpret_cast(filter->data())); + } + for (auto* bias : param.bias) { + if (bias == nullptr) { + arg_bias_.push_back(nullptr); + } else { + arg_bias_.push_back(bias->data()); + } + } + for (auto* max_filter : param.max_filter) { + arg_max_filter_.push_back(max_filter->data()); + } +} + +void XPUResNetCbamCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto input_dims = param.input->dims(); + int batch_size = input_dims[0]; + int height = input_dims[2]; + int width = input_dims[3]; + + int r = xdnn::conv2d_int16_resnet_cbam( + ctx.GetRawContext(), /* context */ + batch_size, /* num */ + height, /* height */ + width, /* width */ + param.input->data(), /* bottom */ + &arg_filter_[0], /* weight_list */ + param.output->mutable_data(TARGET(kXPU)), /* top */ + &arg_bias_[0], /* bias_list */ + &arg_max_filter_[0], /* max_filter_list */ + param.pool_p, /* pool_p */ + true, /* midtype_fp16 */ + false /* dynamic_shape */); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(__xpu__resnet_cbam, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUResNetCbamCompute, + def) + .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("MaxFilter", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/__xpu__resnet_cbam_compute.h b/lite/kernels/xpu/__xpu__resnet_cbam_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..b952bb088ea88399966c170cbeadebfa698889d8 --- /dev/null +++ b/lite/kernels/xpu/__xpu__resnet_cbam_compute.h @@ -0,0 +1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class XPUResNetCbamCompute + : public KernelLite { + public: + using param_t = operators::XPUResNetCbamParam; + + virtual void PrepareForRun(); + + virtual void Run(); + + private: + std::vector arg_filter_; + std::vector arg_max_filter_; + std::vector arg_bias_; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/__xpu__search_attention_compute.cc b/lite/kernels/xpu/__xpu__search_attention_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..515be8935637d89d58db830f96f2ea439e7d7e68 --- /dev/null +++ b/lite/kernels/xpu/__xpu__search_attention_compute.cc @@ -0,0 +1,219 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/xpu/__xpu__search_attention_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void XPUMmdnnSearchAttentionCompute::PrepareForRun() { + offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + pad_begin_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + w_max_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(8 * sizeof(float)); + buffer_at_l3_guard_ = TargetWrapperXPU::MallocScratchPad( + 5 * L3_SLOT_SIZE * sizeof(float), false /* use_l3 */); + buffer_at_gm_guard_ = TargetWrapperXPU::MallocScratchPad( + 5 * GM_SLOT_SIZE * sizeof(float), false /* use_l3 */); + + offset_cpu.reset(new int[64]); + pad_begin_cpu.reset(new int[64]); +} + +void XPUMmdnnSearchAttentionCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* X = param.X; + auto* W = param.W; + auto* b = param.b; + float W_max = param.W_max; + float alpha0 = param.alpha0; + float alpha1 = param.alpha1; + float mask = param.mask; + + const int16_t* w_data = W->data(); + const float* b_data = b->data(); + + int batch = X->lod()[0].size() - 1; + int dim0 = X->dims()[0]; + int dim1 = X->dims()[1]; + const auto offset = X->lod()[0]; + int max_seq = 0; + + auto* top = param.Out; + LoD top_lod; + top_lod.push_back(X->lod()[0]); + top->set_lod(top_lod); + top->Resize({dim0, dim1}); + auto* top_data = top->mutable_data(TARGET(kXPU)); + + float maxs_cpu[8] = {0.0f, 0.0f, 0.0f, 0.0f, W_max, 0.0f, 0.0f, 0.0f}; + for (int i = 0; i < batch; ++i) { + offset_cpu[i] = offset[i]; // type of offset is int64, not supported by xpu + pad_begin_cpu[i] = offset[i + 1] - offset[i]; + if (offset[i + 1] - offset[i] > max_seq) { + max_seq = offset[i + 1] - offset[i]; + } + } + offset_cpu[batch] = offset[batch]; + + xpu_memcpy(offset_xpu_guard_->addr_, + offset_cpu.get(), + offset.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(pad_begin_xpu_guard_->addr_, + pad_begin_cpu.get(), + batch * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(w_max_xpu_guard_->addr_, + maxs_cpu, + 8 * sizeof(float), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + int* offset_xpu = reinterpret_cast(offset_xpu_guard_->addr_); + int* pad_begin_xpu = reinterpret_cast(pad_begin_xpu_guard_->addr_); + float* maxs_xpu = reinterpret_cast(w_max_xpu_guard_->addr_); + float* buffer_at_l3 = reinterpret_cast(buffer_at_l3_guard_->addr_); + float* buffer_at_gm = reinterpret_cast(buffer_at_gm_guard_->addr_); + + // when use l3, max_seq <= 128: + // group_padding: batch * max_seq * dim1; at (slot0, slot1) + // seq_fc: batch * max_seq * dim1; at (slot2, slot3) + // batchgemm0: batch * max_seq * max_seq; at slot4 + // attention_padding_mask: batch * max_seq * max_seq; at slot3 + // seq_softmax: batch * max_seq * max_seq; at slot4 + // batchgemm1: batch * max_seq * dim1; at (slot2, slot3) + float* group_padding_output = buffer_at_l3; + float* seq_fc_output = buffer_at_l3 + 2 * L3_SLOT_SIZE; + float* batchgemm0_output = buffer_at_l3 + 4 * L3_SLOT_SIZE; + float* attention_output = buffer_at_l3 + 3 * L3_SLOT_SIZE; + float* seq_softmax_output = buffer_at_l3 + 4 * L3_SLOT_SIZE; + float* batchgemm1_output = buffer_at_l3 + 2 * L3_SLOT_SIZE; + + if (max_seq > 128) { + group_padding_output = buffer_at_gm; + seq_fc_output = buffer_at_gm + 1 * GM_SLOT_SIZE; + batchgemm0_output = buffer_at_gm + 2 * GM_SLOT_SIZE; + attention_output = buffer_at_gm + 1 * GM_SLOT_SIZE; 
+ seq_softmax_output = buffer_at_gm + 3 * GM_SLOT_SIZE; + batchgemm1_output = buffer_at_gm + 4 * GM_SLOT_SIZE; + } + + const auto* bottom_data = X->data(); + xdnn::search_sequence_pad_depad(ctx.GetRawContext(), + const_cast(bottom_data), + group_padding_output, + offset_xpu, + max_seq, + batch, + dim1, + 0); // is_depad = 0 + // do-findmax + xdnn::findmax(ctx.GetRawContext(), + group_padding_output, + batch * max_seq * dim1, + maxs_xpu); + xdnn::gemm_int16_maxptr( + ctx.GetRawContext(), + false, + true, // trans_a, trans_b + batch * max_seq, + dim1, + dim1, // m, n, k + 1.0f, + group_padding_output, + dim1, // alpha, data_a, lda + w_data, + dim1, + 0.0f, // data_b, ldb, beta + seq_fc_output, + dim1, + b_data, // data_c, ldc, bias + xdnn::Activation_t::LINEAR, + maxs_xpu, + maxs_xpu + 4, + nullptr); // max_a, max_b, max_c + xdnn::search_aligned_mat_mul(ctx.GetRawContext(), + 0, + 1, + batch, + max_seq, + max_seq, + dim1, + alpha0, + group_padding_output, + dim1, + seq_fc_output, + dim1, + batchgemm0_output, + max_seq); + xdnn::search_pad_mask(ctx.GetRawContext(), + batchgemm0_output, + attention_output, + pad_begin_xpu, + batch, + max_seq, + max_seq, + batch, + mask); + xdnn::softmax2d_forward(ctx.GetRawContext(), + attention_output, + seq_softmax_output, + batch * max_seq, + max_seq, + true); + xdnn::search_aligned_mat_mul(ctx.GetRawContext(), + 0, + 0, + batch, + max_seq, + dim1, + max_seq, + alpha1, + seq_softmax_output, + max_seq, + group_padding_output, + dim1, + batchgemm1_output, + dim1); + xdnn::search_sequence_pad_depad(ctx.GetRawContext(), + top_data, + batchgemm1_output, + offset_xpu, + max_seq, + batch, + dim1, + 1); // is_depad = 1 +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(__xpu__mmdnn_search_attention, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUMmdnnSearchAttentionCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/__xpu__search_attention_compute.h b/lite/kernels/xpu/__xpu__search_attention_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..f9670dbab6247927acf6ac7d7b47f98a464a3489 --- /dev/null +++ b/lite/kernels/xpu/__xpu__search_attention_compute.h @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include <memory> +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class XPUMmdnnSearchAttentionCompute + : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> { + public: + using param_t = operators::XPUMmdnnSearchAttentionParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard offset_xpu_guard_; + XPUScratchPadGuard pad_begin_xpu_guard_; + XPUScratchPadGuard w_max_xpu_guard_; + XPUScratchPadGuard buffer_at_l3_guard_; + XPUScratchPadGuard buffer_at_gm_guard_; + + std::unique_ptr<int[]> offset_cpu; + std::unique_ptr<int[]> pad_begin_cpu; + + const int L3_SLOT_SIZE = 40 * 128 * 128; + const int GM_SLOT_SIZE = 40 * 512 * 512; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/concat_compute.cc b/lite/kernels/xpu/concat_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..f088bb80f0c500c6f900726195bcb5903049d3fb --- /dev/null +++ b/lite/kernels/xpu/concat_compute.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/concat_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void ConcatCompute::Run() { + auto& param = this->Param<param_t>(); + auto& ctx = this->ctx_->As<XPUContext>(); + + auto ins = param.x; + auto out = param.output; + int64_t axis = param.axis; + + int n = ins.size(); + int h = 1; + int w_except_axis = 1; + CHECK(n <= 8) << "XPU only supports at most 8 tensors for now"; + for (int i = 0; i < axis; ++i) { + h *= (ins[0]->dims())[i]; + } + for (int i = axis + 1; i < ins[0]->dims().size(); ++i) { + w_except_axis *= (ins[0]->dims())[i]; + } + CHECK(axis >= 0) << "concat: axis should >= 0!"; + CHECK(axis < ins[0]->dims().size()) << "concat: axis should < ins[0]->dims()!"; + for (int i = 0; i < n; ++i) { + int hh = 1; + int ww = 1; + for (int j = 0; j < axis; ++j) { + hh *= (ins[i]->dims())[j]; + } + for (int j = axis + 1; j < ins[i]->dims().size(); ++j) { + ww *= (ins[i]->dims())[j]; + } + CHECK(hh == h) << "concat: h should be equal!"; + CHECK(ww == w_except_axis) << "concat: w should be equal except for axis!"; + } + + int in_w_host[n]; // NOLINT + const float* ptrs[n]; // NOLINT + + for (int i = 0; i < n; ++i) { + ptrs[i] = ins[i]->data<float>(); + in_w_host[i] = w_except_axis * (ins[i]->dims())[axis]; + } + + int r = xdnn::concat<float>(ctx.GetRawContext(), /* ctx */ + h, /* height */ + in_w_host, /* width_x */ + n, /* n */ + ptrs, /* lm_ptrs */ + out->mutable_data<float>(TARGET(kXPU)) /*y*/); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + concat, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::ConcatCompute, def) + .BindInput("X", 
{LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/concat_compute.h b/lite/kernels/xpu/concat_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..f29899a741194270272770d8b781cd9b0b54abc9 --- /dev/null +++ b/lite/kernels/xpu/concat_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class ConcatCompute : public KernelLite { + public: + using param_t = operators::ConcatParam; + + virtual void Run(); + + virtual ~ConcatCompute() = default; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/match_matrix_tensor_compute.cc b/lite/kernels/xpu/match_matrix_tensor_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c4e896d23add6df99a7b66a830dc526dc808e95 --- /dev/null +++ b/lite/kernels/xpu/match_matrix_tensor_compute.cc @@ -0,0 +1,179 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/xpu/match_matrix_tensor_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void MatchMatrixTensorCompute::PrepareForRun() { + wx_max_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + offset_l_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + offset_r_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + + offset_l_cpu.reset(new int[64]); + offset_r_cpu.reset(new int[64]); +} + +void MatchMatrixTensorCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->As(); + + auto* x = param.x; + auto* y = param.y; + auto* w = param.w; + auto* out = param.out; + auto* tmp = param.tmp; + int dim_t = param.dim_t; + float w_max = param.__xpu__w_max; + bool fuse_relu = param.fuse_relu; + bool float_to_fix = param.__xpu__float_to_fix; + CHECK(float_to_fix) << "W should be fixed point"; + + xdnn::Activation_t act = xdnn::Activation_t::LINEAR; + if (fuse_relu) { + act = xdnn::Activation_t::RELU; + } + + int dim_in = x->dims()[1]; + const auto& offset_l = x->lod()[0]; + const auto& offset_r = y->lod()[0]; + + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + top_size += dim_t * len_l * len_r; + top_offset.push_back(top_size); + } + auto* bottom_l_data = x->data(); + auto* bottom_r_data = y->data(); + auto* w_data = w->data(); + auto* out_data = out->mutable_data(TARGET(kXPU)); + auto* bottom_l_trans_data = tmp->mutable_data(TARGET(kXPU)); + int batch_size = x->lod()[0].size() - 1; + + float* wx_max = reinterpret_cast(wx_max_xpu_guard_->addr_); + int* offset_l_xpu = reinterpret_cast(offset_l_xpu_guard_->addr_); + int* offset_r_xpu = reinterpret_cast(offset_r_xpu_guard_->addr_); + + int r = xdnn::gemm_int16_tmp_api( + ctx.GetRawContext(), /* ctx */ + false, + false, /* trans_a, trans_b */ + x->dims()[0], + dim_t * dim_in, + dim_in, /* m, n, k */ + 1.0f, + bottom_l_data, + dim_in, /* alpha, data_a, lda */ + w_data, + dim_t * dim_in, + 0.0f, /* data_b, ldb, beta */ + bottom_l_trans_data, + dim_t * dim_in, /* data_c, ldc */ + nullptr, /* bias */ + xdnn::Activation_t::LINEAR, + 0.0f, + w_max, + wx_max /* max_a, max_b, max_c */); + CHECK_EQ(r, 0); + + int max_width = 0; + for (int i = 0; i < offset_l.size(); ++i) { + offset_l_cpu[i] = offset_l[i]; + if (i != 0 && (offset_l_cpu[i] - offset_l_cpu[i - 1] > max_width)) { + max_width = offset_l_cpu[i] - offset_l_cpu[i - 1]; + } + } + for (int i = 0; i < offset_r.size(); ++i) { + offset_r_cpu[i] = offset_r[i]; + if (i != 0 && (offset_r_cpu[i] - offset_r_cpu[i - 1] > max_width)) { + max_width = offset_r_cpu[i] - offset_r_cpu[i - 1]; + } + } + xpu_memcpy(offset_l_xpu, + offset_l_cpu.get(), + offset_l.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(offset_r_xpu, + offset_r_cpu.get(), + offset_r.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + r = xdnn::match_matrix_tensor(ctx.GetRawContext(), + batch_size, + bottom_l_trans_data, + bottom_r_data, + offset_l_xpu, + offset_r_xpu, + dim_t, + dim_in, + out_data, + wx_max, + act, + max_width); + CHECK_EQ(r, 0); + + int lod_lv1_size = batch_size * dim_t; + int lod_lv2_size = x->lod()[0].back() * dim_t; + std::vector out_lod0(batch_size + 1, 0); + std::vector out_lod1(lod_lv1_size + 1, 0); + 
std::vector out_lod2(lod_lv2_size + 1, 0); + for (int i = 0; i < batch_size; i++) { + out_lod0[i + 1] = out_lod0[i] + dim_t; + int len_l = offset_l[i + 1] - offset_l[i]; + + for (int j = 0; j < dim_t; j++) { + out_lod1[i * dim_t + j + 1] = out_lod1[i * dim_t + j] + len_l; + int len_r = offset_r[i + 1] - offset_r[i]; + + for (int k = 0; k < len_l; k++) { + out_lod2[offset_l[i] * dim_t + j * len_l + k + 1] = + out_lod2[offset_l[i] * dim_t + j * len_l + k] + len_r; + } + } + } + + paddle::lite::LoD out_lod; + out_lod.push_back(top_offset); + out_lod.push_back(offset_l); + out_lod.push_back(offset_r); + out->set_lod(out_lod); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(match_matrix_tensor, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::MatchMatrixTensorCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Tmp", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/match_matrix_tensor_compute.h b/lite/kernels/xpu/match_matrix_tensor_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..3bd0b622db1fce178ea66604d89dc50d6477a105 --- /dev/null +++ b/lite/kernels/xpu/match_matrix_tensor_compute.h @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class MatchMatrixTensorCompute + : public KernelLite { + public: + using param_t = operators::MatchMatrixTensorParam; + + virtual void PrepareForRun(); + + virtual void Run(); + + private: + XPUScratchPadGuard wx_max_xpu_guard_; + XPUScratchPadGuard offset_l_xpu_guard_; + XPUScratchPadGuard offset_r_xpu_guard_; + + std::unique_ptr offset_l_cpu; + std::unique_ptr offset_r_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/search_fc_compute.cc b/lite/kernels/xpu/search_fc_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..79f4c2d0d809ea9848fb383863d0f9dd2ec5a2ae --- /dev/null +++ b/lite/kernels/xpu/search_fc_compute.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/search_fc_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SearchFcCompute::PrepareForRun() { + maxs_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(float)); +} + +void SearchFcCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* bottom = param.X; + auto* w = param.W; + auto* b = param.b; + auto* top = param.Out; + float w_max = param.__xpu__w_max; + int out_size = param.out_size; + bool fuse_relu = param.fuse_relu; + bool float_to_fix = param.__xpu__float_to_fix; + CHECK(float_to_fix) << "W should be fixed point"; + + int batch = bottom->dims()[0]; + int _out = w->dims()[0]; + int _in = w->dims()[1]; + + xdnn::Activation_t act = xdnn::Activation_t::LINEAR; + if (fuse_relu) { + act = xdnn::Activation_t::RELU; + } + + std::vector top_dims{bottom->dims()[0], out_size}; + top->Resize(top_dims); + + const auto* bottom_data = bottom->data(); + const auto* weights = w->data(); + const auto* bias_data = b->data(); + auto* top_data = top->mutable_data(TARGET(kXPU)); + + float* maxs_xpu = reinterpret_cast(maxs_xpu_guard_->addr_); + float maxs_cpu[8] = {0.0f, 0.0f, 0.0f, 0.0f, w_max, 0.0f, 0.0f, 0.0f}; + xpu_memcpy(maxs_xpu, + &maxs_cpu[0], + 8 * sizeof(float), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + int r = xdnn::findmax( + ctx.GetRawContext(), bottom_data, batch * _in, maxs_xpu); + CHECK_EQ(r, 0); + r = xdnn::gemm_int16_maxptr( + ctx.GetRawContext(), /* ctx */ + false, + true, /*trans_a, trans_b*/ + batch, + _out, + _in, /*m, n, k*/ + 1.0f, + bottom_data, + _in, /*alpha, data_a, lda*/ + weights, + _in, + 0.0f, /*data_b, ldb, beta*/ + top_data, + _out, + bias_data, /* data_c, ldc, bias*/ + act, + maxs_xpu, + maxs_xpu + 4, + nullptr /*act, max_a, max_b, max_c*/); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_fc, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SearchFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/search_fc_compute.h b/lite/kernels/xpu/search_fc_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..c7ee06abd957187c18c1306f40a77735f40558e7 --- /dev/null +++ b/lite/kernels/xpu/search_fc_compute.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class SearchFcCompute : public KernelLite { + public: + using param_t = operators::SearchFcParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard maxs_xpu_guard_; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/search_grnn_compute.cc b/lite/kernels/xpu/search_grnn_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..1c19f58da1b5deaa3d74791561494f13b681cf3a --- /dev/null +++ b/lite/kernels/xpu/search_grnn_compute.cc @@ -0,0 +1,282 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/xpu/search_grnn_compute.h" +#include +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SearchGrnnCompute::PrepareForRun() { + offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + new_offset_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(256 * sizeof(int)); + maxs_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(16 * sizeof(float)); + + idx_sorted_by_width_data_cpu.reset(new int[64]); + offset_cpu.reset(new int[64]); + new_offset_cpu.reset(new int[256]); +} + +void SearchGrnnCompute::prepare_layout(const operators::SearchGrnnParam& param, + const paddle::lite::Tensor* bottom) { + auto* idx_sorted_by_width = param.idx_sorted_by_width; + auto* layout_input = param.layout_input; + + int dim0 = bottom->dims()[0]; + int dim1 = 1; + if (bottom->dims().size() > 1) { + dim1 = bottom->dims()[1]; + } + int batch = bottom->lod()[0].size() - 1; + auto& offset = bottom->lod()[0]; + + idx_sorted_by_width->Resize({batch}); + std::vector width; + width.resize(batch); + + // sort sequences by width (descending) and find the largest width in the + // batch + for (int i = 0; i < batch; i++) { + width[i] = offset[i + 1] - offset[i]; + idx_sorted_by_width_data_cpu[i] = i; + } + std::sort(idx_sorted_by_width_data_cpu.get(), + idx_sorted_by_width_data_cpu.get() + batch, + [&width](int a, int b) { return width[a] > width[b]; }); + int max_width = width[idx_sorted_by_width_data_cpu[0]]; + + // start of reorganizing the input + std::vector new_offset; + new_offset.resize(max_width + 1); + new_offset[0] = 0; + int j = batch - 1; + int last_width = 0; + int sub_row = 0; + int sub_col = 0; + + for (int i = 1; i <= max_width;) { + for (int k = j; k >= 0; --k) { + if (width[idx_sorted_by_width_data_cpu[k]] > last_width) { + sub_row = width[idx_sorted_by_width_data_cpu[k]] - last_width; + sub_col = k + 1; + for (int s = 0; s < sub_row; s++) { + new_offset[i] = new_offset[i - 1] + sub_col; + i++; + } + // move on + last_width = width[idx_sorted_by_width_data_cpu[k]]; + j = k - 1; + break; + } + } + } + + // copying to the reorganized buffer + if (bottom->dims().size() == 1) { + } else { + LoD new_lod; + new_lod.push_back(new_offset); + layout_input->set_lod(new_lod); + layout_input->Resize({dim0, dim1}); + } + + xpu_memcpy(idx_sorted_by_width->mutable_data(TARGET(kXPU)), + idx_sorted_by_width_data_cpu.get(), + idx_sorted_by_width->numel() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); +} + +void SearchGrnnCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* bottom = param.x; + auto* wi = param.wi; + auto* wh = param.wh; + auto* top = param.out; + auto* tmp_buffer = param.tmp_buffer; + auto* idx_sorted_by_width = param.idx_sorted_by_width; + auto* layout_input = param.layout_input; + int cap_h = param.num_hidden; + int cap_e = param.num_input; + int cap_l = bottom->dims()[0]; + auto wi_max = param.__xpu__wi_max; + auto wh_max = param.__xpu__wh_max; + bool float_to_fix = param.__xpu__float_to_fix; + CHECK(float_to_fix) << "W should be fixed point"; + + int dim = 1; + if (bottom->dims().size() > 1) { + dim = bottom->dims()[1]; + } + + const auto& offset = bottom->lod()[0]; + LoD top_lod; + top_lod.push_back(offset); + top->set_lod(top_lod); + std::vector top_dims_vec{cap_l, cap_h}; + top->Resize(top_dims_vec); + auto* top_hidden = top->mutable_data(TARGET(kXPU)); + const auto* dense_e2h = 
wi->data(); + const auto* dense_h2h = wh->data(); + + // Prepare idx_sorted_by_width + prepare_layout(param, bottom); + int batch = bottom->lod()[0].size() - 1; + int max_width = layout_input->lod()[0].size() - 1; + const auto& new_offset = layout_input->lod()[0]; + auto* new_emb = layout_input->mutable_data(TARGET(kXPU)); + + // Prepare offset and new_offset + int* offset_xpu = reinterpret_cast(offset_xpu_guard_->addr_); + int* new_offset_xpu = reinterpret_cast(new_offset_xpu_guard_->addr_); + float* maxs_xpu = reinterpret_cast(maxs_xpu_guard_->addr_); + CHECK_LE(offset.size(), 64); + CHECK_LE(new_offset.size(), 256); + + for (size_t i = 0; i < offset.size(); ++i) { + offset_cpu[i] = offset[i]; + } + for (size_t i = 0; i < new_offset.size(); ++i) { + new_offset_cpu[i] = new_offset[i]; + } + xpu_memcpy(offset_xpu, + offset_cpu.get(), + offset.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(new_offset_xpu, + new_offset_cpu.get(), + new_offset.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + int r = xdnn::search_seq2batch(ctx.GetRawContext(), + batch, + max_width, + dim, + idx_sorted_by_width->data(), + offset_xpu, + new_offset_xpu, + bottom->data(), + new_emb); + CHECK_EQ(r, 0); + + // this buffer is used for book keeping info which will be used in bp + // buffer also needed in bp, so make it larger + tmp_buffer->Resize({20, cap_l, cap_h}); + auto* buffer_data = tmp_buffer->mutable_data(TARGET(kXPU)); + // the internal hidden + auto* hidden = buffer_data + 19 * cap_l * cap_h; + + // do-findmax + float maxs_cpu[16] = {0.0f, + 0.0f, + 0.0f, + 0.0f, + wi_max[0], + 0.0f, + 0.0f, + 0.0f, + wi_max[1], + 0.0f, + 0.0f, + 0.0f, + wi_max[2], + 0.0f, + 0.0f, + 0.0f}; + xpu_memcpy(maxs_xpu, + maxs_cpu, + 16 * sizeof(float), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + r = xdnn::findmax( + ctx.GetRawContext(), new_emb, cap_l * cap_e, maxs_xpu); + CHECK_EQ(r, 0); + + // precompute embedding to hidden + for (int i = 0; i < 3; ++i) { + const int16_t* data_b = dense_e2h + i * cap_e * cap_h; // e2h, e2hr, e2hz + float* data_c = buffer_data + i * cap_l * cap_h; // w_x_e, wr_x_e, wz_x_e + int r = xdnn::gemm_int16_maxptr( + ctx.GetRawContext(), + false, + true, // trans_a, trans_b + cap_l, + cap_h, + cap_e, // m, n, k + 1.0f, + new_emb, + cap_e, // alpha, data_a, lda + data_b, + cap_e, + 0.0f, // data_b, ldb, beta + data_c, + cap_h, // data_c, ldc + nullptr, + xdnn::Activation_t::LINEAR, // bias, act + maxs_xpu, + maxs_xpu + 4 * (i + 1)); // max_a, max_b + CHECK_EQ(r, 0); + } + + r = xdnn::search_grnn(ctx.GetRawContext(), + cap_l, + cap_h, + cap_e, + max_width, + new_offset_xpu, + buffer_data, + dense_h2h, + hidden, + wh_max[0], + wh_max[1], + wh_max[2]); + CHECK_EQ(r, 0); + + r = xdnn::search_batch2seq(ctx.GetRawContext(), + batch, + max_width, + cap_h, + idx_sorted_by_width->data(), + offset_xpu, + new_offset_xpu, + hidden, + top_hidden); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_grnn, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SearchGrnnCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Wi", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Wh", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("tmp_buffer", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("idx_sorted_by_width", + {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))}) + 
.BindOutput("layout_input", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/search_grnn_compute.h b/lite/kernels/xpu/search_grnn_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..7208e782474d39eabb41b4bc969d27a1d7d5f797 --- /dev/null +++ b/lite/kernels/xpu/search_grnn_compute.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class SearchGrnnCompute : public KernelLite { + public: + using param_t = operators::SearchGrnnParam; + + void PrepareForRun() override; + + void prepare_layout(const operators::SearchGrnnParam& param, + const paddle::lite::Tensor* bottom); + void Run() override; + + private: + XPUScratchPadGuard offset_xpu_guard_; + XPUScratchPadGuard new_offset_xpu_guard_; + XPUScratchPadGuard maxs_xpu_guard_; + + std::unique_ptr idx_sorted_by_width_data_cpu; + std::unique_ptr offset_cpu; + std::unique_ptr new_offset_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/sequence_arithmetic_compute.cc b/lite/kernels/xpu/sequence_arithmetic_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..226c615dba57ae381ed2457e588c5df32f25e04b --- /dev/null +++ b/lite/kernels/xpu/sequence_arithmetic_compute.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
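For reference, prepare_layout() in the search_grnn kernel above sorts sequences by descending length and builds a batch-major offset table in which new_offset[t] records how many rows the first t time steps occupy after reorganization. The sketch below is an assumed, equivalent host-side re-implementation using a simpler counting loop; it is not the kernel code itself:

```cpp
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

// Given LoD offsets [0, o1, ..., oN], produce (idx_sorted, new_offset):
// sequences sorted by descending length, and new_offset[t] = number of rows
// covered by the first t time steps in the batch-major layout.
void PrepareLayout(const std::vector<int>& offset,
                   std::vector<int>* idx_sorted,
                   std::vector<int>* new_offset) {
  int batch = static_cast<int>(offset.size()) - 1;
  std::vector<int> width(batch);
  idx_sorted->resize(batch);
  std::iota(idx_sorted->begin(), idx_sorted->end(), 0);
  for (int i = 0; i < batch; ++i) width[i] = offset[i + 1] - offset[i];
  std::sort(idx_sorted->begin(), idx_sorted->end(),
            [&width](int a, int b) { return width[a] > width[b]; });
  int max_width = width[(*idx_sorted)[0]];
  new_offset->assign(max_width + 1, 0);
  for (int t = 0; t < max_width; ++t) {
    // Count the sequences still alive at time step t.
    int alive = 0;
    for (int i = 0; i < batch; ++i) alive += (width[i] > t) ? 1 : 0;
    (*new_offset)[t + 1] = (*new_offset)[t] + alive;
  }
}

int main() {
  std::vector<int> idx, new_off;
  PrepareLayout({0, 2, 5, 6}, &idx, &new_off);  // sequence lengths 2, 3, 1
  for (int v : new_off) std::printf("%d ", v);  // prints: 0 3 5 6
  std::printf("\n");
  return 0;
}
```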
+ +#include "lite/kernels/xpu/sequence_arithmetic_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SequenceArithmeticCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* bottom0 = param.X; + auto* bottom1 = param.Y; + auto* top = param.Out; + + int op_type = param.op_type; + + auto len1 = bottom0->numel(); + auto len2 = bottom1->numel(); + const auto* bottom_data0 = bottom0->data(); + const auto* bottom_data1 = bottom1->data(); + auto* top_data = top->mutable_data(TARGET(kXPU)); + + switch (op_type) { + case 1: // addition: top[0] = bottom[0] + bottom[1] + if (len1 > len2) { + xdnn::elementwise_add( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len2); + xdnn::memcpy_device(ctx.GetRawContext(), + &top_data[len2], + &bottom_data0[len2], + (len1 - len2) * sizeof(float)); + } else { + xdnn::elementwise_add( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len1); + } + break; + case 2: // substraction: top[0] = bottom[0] - bottom[1] + if (len1 > len2) { + xdnn::elementwise_sub( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len2); + xdnn::memcpy_device(ctx.GetRawContext(), + &top_data[len2], + &bottom_data0[len2], + (len1 - len2) * sizeof(float)); + } else { + xdnn::elementwise_sub( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len1); + } + break; + case 3: // multiplication: top[0] = bottom[0] * bottom[1] + if (len1 > len2) { + xdnn::elementwise_mul( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len2); + xdnn::memcpy_device(ctx.GetRawContext(), + &top_data[len2], + &bottom_data0[len2], + (len1 - len2) * sizeof(float)); + } else { + xdnn::elementwise_mul( + ctx.GetRawContext(), bottom_data0, bottom_data1, top_data, len1); + } + break; + default: + break; + } +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_arithmetic, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL(search_seq_arithmetic, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/sequence_arithmetic_compute.h b/lite/kernels/xpu/sequence_arithmetic_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..9526587ac48cd5025022d646e31c24cac6b59a13 --- /dev/null +++ b/lite/kernels/xpu/sequence_arithmetic_compute.h @@ -0,0 +1,36 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class SequenceArithmeticCompute + : public KernelLite { + public: + using param_t = operators::SequenceArithmeticParam; + + void Run() override; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/sequence_concat_compute.cc b/lite/kernels/xpu/sequence_concat_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..fd7f5999a6ccb18efbcb0e96b50f2b31884fc21c --- /dev/null +++ b/lite/kernels/xpu/sequence_concat_compute.cc @@ -0,0 +1,141 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/sequence_concat_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SequenceConcatCompute::PrepareForRun() { + lod0_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + lod1_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + + lod0_cpu.reset(new int[64]); + lod1_cpu.reset(new int[64]); +} + +template +inline LoD ConcatLoD(const std::vector& xs, + std::vector* xs_in_order) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + if (x_lod[i - 1] < x_lod[i]) { + xs_in_order->emplace_back(xs[j]->Slice(x_lod[i - 1], x_lod[i])); + } + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +void SequenceConcatCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto xs = param.X; + auto out = param.Out; + + size_t lod_size = 0; + for (auto& x : xs) { + if (lod_size == 0) { + lod_size = x->lod()[0].size(); + } else { + CHECK_EQ(lod_size, x->lod()[0].size()) + << "The number of sequence must be same between each input"; + } + } + CHECK_NE(lod_size, 0) << "Each input must have sequence information"; + + // TODO(miaotianxiang): + int64_t dim0 = 0; + int64_t feature_size = 0; + std::vector out_dims; + for (const auto& tensor : param.X) { + const auto x_dims = tensor->dims(); + if (out_dims.empty()) { + out_dims = x_dims.data(); + } + dim0 += x_dims[0]; + if (feature_size == 0) { + feature_size = x_dims.production() / x_dims[0]; + } else { + CHECK_EQ(feature_size, x_dims.production() / x_dims[0]) + << "Inputs of sequence concat must have same feature size"; + } + } + out_dims[0] = dim0; + out->Resize(out_dims); + std::vector x_in_order; + out->set_lod(ConcatLoD(xs, &x_in_order)); + + CHECK(xs.size() == 2) << "XPU only support sequence_pool for 2 tensors"; + + auto lod0 = 
xs[0]->lod()[0]; + auto lod1 = xs[1]->lod()[0]; + int batch_size = lod0.size() - 1; + + int* lod0_xpu = reinterpret_cast(lod0_xpu_guard_->addr_); + int* lod1_xpu = reinterpret_cast(lod1_xpu_guard_->addr_); + for (int i = 0; i < lod0.size(); ++i) { + lod0_cpu[i] = lod0[i]; + } + for (int i = 0; i < lod1.size(); ++i) { + lod1_cpu[i] = lod1[i]; + } + xpu_memcpy(lod0_xpu, + lod0_cpu.get(), + lod0.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(lod1_xpu, + lod1_cpu.get(), + lod1.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + int r = xdnn::sequence_concat(ctx.GetRawContext(), + xs[0]->data(), + lod0_xpu, + xs[1]->data(), + lod1_xpu, + out->mutable_data(TARGET(kXPU)), + batch_size); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_concat, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SequenceConcatCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/sequence_concat_compute.h b/lite/kernels/xpu/sequence_concat_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..5726671975d546d1e549ecbe95790c11faafba7b --- /dev/null +++ b/lite/kernels/xpu/sequence_concat_compute.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class SequenceConcatCompute + : public KernelLite { + public: + using param_t = operators::SequenceConcatParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard lod0_xpu_guard_; + XPUScratchPadGuard lod1_xpu_guard_; + + std::unique_ptr lod0_cpu; + std::unique_ptr lod1_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/sequence_pool_compute.cc b/lite/kernels/xpu/sequence_pool_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..81d9b5873c3c42afe94acdd8eb5a292326b7a7b6 --- /dev/null +++ b/lite/kernels/xpu/sequence_pool_compute.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/sequence_pool_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void XPUSequencePoolCompute::PrepareForRun() { + lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + lod_cpu.reset(new int[64]); +} + +void XPUSequencePoolCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* in = param.X; + auto* out = param.Out; + std::string pool_type_str = param.pool_type; + + auto dims = in->dims(); + auto lod = in->lod(); + dims[0] = lod[0].size() - 1; + + xdnn::Pooling_t pool_type = xdnn::Pooling_t::MAX_WITHOUT_INDEX; + if (pool_type_str == "MAX") { + } else if (pool_type_str == "LAST") { + pool_type = xdnn::Pooling_t::LAST; + } else { + CHECK(false); + } + + int num_seq = out->dims()[0]; + int dim = out->numel() / num_seq; + + auto in_lod = in->lod()[0]; + for (size_t i = 0; i < in_lod.size(); ++i) { + lod_cpu[i] = in_lod[i]; + } + int* lod_xpu = reinterpret_cast(lod_xpu_guard_->addr_); + xpu_memcpy(lod_xpu, + lod_cpu.get(), + in_lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + int r = + xdnn::sequence_pooling_forward(ctx.GetRawContext(), + pool_type, + num_seq, + lod_xpu, + dim, + in->data(), + nullptr /* index */, + out->mutable_data(TARGET(kXPU))); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_pool, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::XPUSequencePoolCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("MaxIndex", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/sequence_pool_compute.h b/lite/kernels/xpu/sequence_pool_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..232634de0e387e764eccdeeda4cb8fd2d5dce598 --- /dev/null +++ b/lite/kernels/xpu/sequence_pool_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
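For reference, the sequence_arithmetic kernel earlier in this patch applies the element-wise op over the shorter of the two lengths and then copies the untouched tail of X straight into the output when X is longer than Y. A CPU reference of that behaviour for the add case (illustrative only; the helper name is hypothetical):

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// top = x (+) y over min(len_x, len_y); any remaining tail of x is copied.
std::vector<float> SeqAdd(const std::vector<float>& x,
                          const std::vector<float>& y) {
  std::vector<float> top(x.size());
  size_t common = std::min(x.size(), y.size());
  for (size_t i = 0; i < common; ++i) top[i] = x[i] + y[i];
  std::copy(x.begin() + common, x.end(), top.begin() + common);
  return top;
}

int main() {
  auto out = SeqAdd({1, 2, 3, 4}, {10, 20});
  for (float v : out) std::printf("%g ", v);  // prints: 11 22 3 4
  std::printf("\n");
  return 0;
}
```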
+ +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class XPUSequencePoolCompute + : public KernelLite { + public: + using param_t = operators::SequencePoolParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard lod_xpu_guard_; + + std::unique_ptr lod_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/sequence_reverse_compute.cc b/lite/kernels/xpu/sequence_reverse_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..11e4b80570c19fa90e7846d18a88f966f9a003b7 --- /dev/null +++ b/lite/kernels/xpu/sequence_reverse_compute.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/xpu/sequence_reverse_compute.h" +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template +void SequenceReverseCompute::PrepareForRun() { + lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + lod_cpu.reset(new int[64]); +} + +template +void SequenceReverseCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* x = param.X; + auto* y = param.Out; + + auto lod = x->lod()[0]; + size_t limit = x->numel(); + size_t ele_cnt_in_4_byte = limit / x->dims()[0]; + auto* x_data = x->template data(); + auto* y_data = y->template mutable_data(TARGET(kXPU)); + int batch_size = lod.size() - 1; + + if (std::is_same::value) { + ele_cnt_in_4_byte /= 4; + } else if (std::is_same::value) { + // remain the same + } else if (std::is_same::value) { + ele_cnt_in_4_byte *= 2; + } else if (std::is_same::value) { + // remain the same + } else if (std::is_same::value) { + ele_cnt_in_4_byte *= 2; + } + + for (size_t i = 0; i < lod.size(); ++i) { + lod_cpu[i] = lod[i]; + } + int* lod_xpu = reinterpret_cast(lod_xpu_guard_->addr_); + xpu_memcpy(lod_xpu, + lod_cpu.get(), + lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + int r = xdnn::sequence_reverse(ctx.GetRawContext(), + batch_size, + lod_xpu, + ele_cnt_in_4_byte, + reinterpret_cast(x_data), + reinterpret_cast(y_data)); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +namespace xpu = paddle::lite::kernels::xpu; +using SequenceReverseFp32 = + xpu::SequenceReverseCompute; +using SequenceReverseInt64 = + xpu::SequenceReverseCompute; + +REGISTER_LITE_KERNEL( + sequence_reverse, kXPU, kFloat, kNCHW, SequenceReverseFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); + +REGISTER_LITE_KERNEL( + sequence_reverse, kXPU, kInt64, kNCHW, 
SequenceReverseInt64, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/xpu/sequence_reverse_compute.h b/lite/kernels/xpu/sequence_reverse_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..91b285de767c65f93352380df7877e53d61ccd53 --- /dev/null +++ b/lite/kernels/xpu/sequence_reverse_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +template +class SequenceReverseCompute : public KernelLite { + public: + using param_t = operators::SequenceReverseParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard lod_xpu_guard_; + std::unique_ptr lod_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/sequence_topk_avg_pooling_compute.cc b/lite/kernels/xpu/sequence_topk_avg_pooling_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..54c74211f9738995a8191c77e879a85762d71b3b --- /dev/null +++ b/lite/kernels/xpu/sequence_topk_avg_pooling_compute.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
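For reference, sequence_reverse above converts the per-row element count into 4-byte units so one device routine can reverse several dtypes. The dtype branches lost their template arguments in this copy of the patch; the sketch below assumes the conversion factor is simply sizeof(T) / 4:

```cpp
#include <cstdint>
#include <cstdio>

// Number of 4-byte words occupied by `elems_per_row` elements of type T.
// Mirrors the ele_cnt_in_4_byte adjustment in the XPU sequence_reverse kernel.
template <typename T>
size_t EleCntIn4Byte(size_t elems_per_row) {
  return elems_per_row * sizeof(T) / 4;
}

int main() {
  std::printf("float : %zu\n", EleCntIn4Byte<float>(8));    // 8
  std::printf("int64 : %zu\n", EleCntIn4Byte<int64_t>(8));  // 16
  std::printf("int16 : %zu\n", EleCntIn4Byte<int16_t>(8));  // 4
  return 0;
}
```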
+ +#include "lite/kernels/xpu/sequence_topk_avg_pooling_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void SequenceTopkAvgPoolingCompute::PrepareForRun() { + lod_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(256 * sizeof(int)); + in_lod_cpu.reset(new int[64]); + row_lod_cpu.reset(new int[64]); + col_lod_cpu.reset(new int[64]); +} + +void SequenceTopkAvgPoolingCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* in = param.X; + auto* row = param.ROW; + auto* col = param.COLUMN; + auto* out = param.Out; + auto* pos = param.pos; + + auto channel_num = param.channel_num; + auto topks = param.topks; + auto k_num = topks.size(); + auto max_k = topks[topks.size() - 1]; + auto in_lod = in->lod()[0]; + + auto row_lod = row->lod()[0]; + auto col_lod = col->lod()[0]; + int batch_size = row_lod.size() - 1; + int pos_total_size = row_lod[batch_size] * channel_num * max_k; + std::vector vec_pos_shape; + vec_pos_shape.push_back(pos_total_size); + pos->Resize(vec_pos_shape); + auto pos_data = pos->mutable_data(TARGET(kXPU)); + + int offset = 0; + std::vector vec_out_lod; + vec_out_lod.reserve(batch_size + 1); + for (int i = 0; i <= batch_size; ++i) { + offset = row_lod[i]; + vec_out_lod.push_back(offset); + } + LoD lod_temp; + lod_temp.push_back(vec_out_lod); + out->set_lod(lod_temp); + + auto in_data = in->data(); + auto out_data = out->mutable_data(TARGET(kXPU)); + + int* in_lod_xpu = reinterpret_cast(lod_xpu_guard_->addr_); + int* row_lod_xpu = in_lod_xpu + in_lod.size(); + int* col_lod_xpu = row_lod_xpu + row_lod.size(); + int* topks_xpu = col_lod_xpu + col_lod.size(); + for (int i = 0; i < in_lod.size(); ++i) { + in_lod_cpu[i] = in_lod[i]; + } + for (int i = 0; i < row_lod.size(); ++i) { + row_lod_cpu[i] = row_lod[i]; + } + for (int i = 0; i < col_lod.size(); ++i) { + col_lod_cpu[i] = col_lod[i]; + } + xpu_memcpy(in_lod_xpu, + in_lod_cpu.get(), + in_lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(row_lod_xpu, + row_lod_cpu.get(), + row_lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(col_lod_xpu, + col_lod_cpu.get(), + col_lod.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(topks_xpu, + topks.data(), + topks.size() * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + int r = xdnn::sequence_topk_avg_pooling(ctx.GetRawContext(), + in_data, + out_data, + pos_data, + batch_size, + channel_num, + in_lod_xpu, + row_lod_xpu, + col_lod_xpu, + topks_xpu, + k_num); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_topk_avg_pooling, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::SequenceTopkAvgPoolingCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("ROW", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("COLUMN", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("pos", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/sequence_topk_avg_pooling_compute.h b/lite/kernels/xpu/sequence_topk_avg_pooling_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..7c54ca96225ee9ec37d6d0487a526347c19fdb2d --- /dev/null +++ b/lite/kernels/xpu/sequence_topk_avg_pooling_compute.h @@ -0,0 
+1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class SequenceTopkAvgPoolingCompute + : public KernelLite { + public: + using param_t = operators::SequenceTopkAvgPoolingParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard lod_xpu_guard_; + std::unique_ptr in_lod_cpu; + std::unique_ptr row_lod_cpu; + std::unique_ptr col_lod_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/xpu/subgraph_compute.cc b/lite/kernels/xpu/subgraph_compute.cc index 9c2191331c85a7f99ffb5a2e9662ed5831cb1dda..981922f8eacab57da4638e1fdcdd3df72465b379 100644 --- a/lite/kernels/xpu/subgraph_compute.cc +++ b/lite/kernels/xpu/subgraph_compute.cc @@ -27,12 +27,35 @@ namespace lite { namespace kernels { namespace xpu { -int SubgraphEngine::BuildDeviceProgram() { +bool SubgraphEngine::PrepareWorkspaceForDeviceProgram() { + // Obtain the origin input tensors, and create the origin output + // tensors(Don't try to access them before launch the device program or the + // origin program) + PrepareWorkspaceForOriginProgram(); + // Create the device input and output tensors, but don't initialize them + // with the dimensions + device_itensors_.resize(input_names_.size()); + for (int i = 0; i < input_names_.size(); i++) { + device_itensors_[i].reset(new hiai::AiTensor); + CHECK(device_itensors_[i]); + } + device_otensors_.resize(output_names_.size()); + for (int i = 0; i < output_names_.size(); i++) { + device_otensors_[i].reset(new hiai::AiTensor); + CHECK(device_otensors_[i]); + } + return true; +} + +bool SubgraphEngine::BuildDeviceProgram() { int status = 0; // Convert all of ops and their input vars and weights and added into the XPU // IR graph subgraph::xpu::Graph graph; const auto& bridges = subgraph::Registry::Instance(); + if (origin_program_.empty()) { + BuildOriginProgram(); + } for (auto& inst : origin_program_) { auto op = const_cast(inst.op()); CHECK(op); @@ -40,13 +63,13 @@ int SubgraphEngine::BuildDeviceProgram() { op->InferShape(); std::string op_type = op->op_info()->Type(); if (!bridges.Exists(op_type, TARGET(kXPU))) { - return subgraph::FAILED; + return false; } auto kernel = inst.kernel(); status |= bridges.Select(op_type, TARGET(kXPU))( reinterpret_cast(&graph), op, const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { - return subgraph::FAILED; + return false; } } // Obtain the output nodes of the XPU IR graph and build the graph to the XPU @@ -86,7 +109,7 @@ int SubgraphEngine::BuildDeviceProgram() { &graph.builder_, &graph.params_, &device_onodes); if (device_program_ == nullptr) { LOG(WARNING) << "[XPU] Build model failed!"; - return subgraph::FAILED; + return false; } // Query and check the 
dimensions of input and output tensors @@ -166,10 +189,10 @@ int SubgraphEngine::BuildDeviceProgram() { device_otensors_[i].strides = nullptr; device_otensors_[i].byte_offset = 0; } - return status; + return true; } -int SubgraphEngine::LaunchDeviceProgram() { +bool SubgraphEngine::LaunchDeviceProgram() { for (size_t i = 0; i < device_itensors_.size(); i++) { // Update the data pointer of DLTensor to track the origin input tensors device_itensors_[i].data = @@ -191,7 +214,7 @@ int SubgraphEngine::LaunchDeviceProgram() { const_cast(origin_otensors_[i]->raw_data()); device_program_->CopyOutputTo(i, &device_otensors_[i]); } - return 0; + return true; } void SubgraphCompute::PrepareForRun() { @@ -203,12 +226,11 @@ void SubgraphCompute::PrepareForRun() { param.output_data_names, param.scope)); CHECK(engine_); - engine_->Build(); } void SubgraphCompute::Run() { CHECK(engine_); - engine_->Launch(); + engine_->Run(); } } // namespace xpu diff --git a/lite/kernels/xpu/subgraph_compute.h b/lite/kernels/xpu/subgraph_compute.h index 601c8821bc826e350c233573bf7eff89cdf5c1f5..f09a06a85d5382c72e9efb20cede8bea1922f2da 100644 --- a/lite/kernels/xpu/subgraph_compute.h +++ b/lite/kernels/xpu/subgraph_compute.h @@ -39,13 +39,14 @@ class SubgraphEngine : public subgraph::Engine { ctx, block_idx, block_desc, input_names, output_names, scope) {} protected: - int BuildDeviceProgram() override; - int LaunchDeviceProgram() override; + bool PrepareWorkspaceForDeviceProgram() override; + bool BuildDeviceProgram() override; + bool LaunchDeviceProgram() override; std::vector device_inames_; std::vector device_onames_; - std::vector device_itensors_; - std::vector device_otensors_; + std::vector device_itensors_{}; + std::vector device_otensors_{}; std::unique_ptr device_program_{nullptr}; }; diff --git a/lite/kernels/xpu/var_conv_2d_compute.cc b/lite/kernels/xpu/var_conv_2d_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..b573c810922db98e901c9f9a1953116f3fdfc657 --- /dev/null +++ b/lite/kernels/xpu/var_conv_2d_compute.cc @@ -0,0 +1,139 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
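For reference, the sequence_topk_avg_pooling kernel above packs its four integer tables (input lod, row lod, column lod, topks) back to back in the single 256-int scratchpad allocated in PrepareForRun, deriving each device pointer by offsetting from the buffer start. A host-side sketch of that packing (hypothetical helper, not a Lite API):

```cpp
#include <cassert>
#include <cstdio>
#include <vector>

// Lay out in_lod | row_lod | col_lod | topks contiguously, recording the
// start offset of each table so device pointers can be derived by addition.
std::vector<int> PackLodTables(const std::vector<std::vector<int>>& tables,
                               std::vector<int>* starts) {
  std::vector<int> packed;
  starts->clear();
  for (const auto& t : tables) {
    starts->push_back(static_cast<int>(packed.size()));
    packed.insert(packed.end(), t.begin(), t.end());
  }
  assert(packed.size() <= 256u);  // scratchpad allocated as 256 * sizeof(int)
  return packed;
}

int main() {
  std::vector<int> starts;
  auto packed =
      PackLodTables({{0, 4, 9}, {0, 2, 3}, {0, 3, 5}, {1, 3}}, &starts);
  for (int s : starts) std::printf("start=%d ", s);  // start=0 start=3 start=6 start=9
  std::printf("total=%zu\n", packed.size());
  return 0;
}
```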
+ +#include "lite/kernels/xpu/var_conv_2d_compute.h" +#include +#include "lite/backends/xpu/xpu_header_sitter.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +void VarConv2DCompute::PrepareForRun() { + offset_x_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + offset_y_xpu_guard_ = TargetWrapperXPU::MallocScratchPad(64 * sizeof(int)); + offset_x_cpu.reset(new int[64]); + offset_y_cpu.reset(new int[64]); +} + +void VarConv2DCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + + auto* bottom = param.X; + auto* w = param.W; + auto* top = param.Out; + + int output_channel = param.output_channel; + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + float w_max = param.__xpu__w_max; + bool fuse_relu = param.fuse_relu; + bool float_to_fix = param.__xpu__float_to_fix; + CHECK(float_to_fix) << "W should be fixed point"; + + xdnn::Activation_t act = xdnn::Activation_t::LINEAR; + if (fuse_relu) { + act = xdnn::Activation_t::RELU; + } + + int batch = bottom->lod()[0].size() - 1; + const auto& offset_x = bottom->lod()[2]; + const auto& offset_y = bottom->lod()[1]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + int top_im_y = 0; + if (width != 0) { + top_im_x = (width - 1) / stride_w + 1; + } + if (height != 0) { + top_im_y = (height - 1) / stride_h + 1; + } + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top_lod.push_back(bottom->lod()[1]); + top_lod.push_back(bottom->lod()[2]); + top->set_lod(top_lod); + std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + auto* top_data = top->mutable_data(TARGET(kXPU)); + + auto* bottom_data = bottom->data(); + auto* w_data = w->data(); + + int* offset_x_xpu = reinterpret_cast(offset_x_xpu_guard_->addr_); + int* offset_y_xpu = reinterpret_cast(offset_y_xpu_guard_->addr_); + for (int i = 0; i < (batch + 1); ++i) { + offset_x_cpu[i] = offset_x[i]; + offset_y_cpu[i] = offset_y[i]; + } + xpu_memcpy(offset_x_xpu, + offset_x_cpu.get(), + (batch + 1) * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + xpu_memcpy(offset_y_xpu, + offset_y_cpu.get(), + (batch + 1) * sizeof(int), + XPUMemcpyKind::XPU_HOST_TO_DEVICE); + + int r = xdnn::search_varconv(ctx.GetRawContext(), + batch, + input_channel, + output_channel, + kernel_h, + kernel_w, + stride_h, + stride_w, + bottom_data, + w_data, + offset_x_xpu, + offset_y_xpu, + top_data, + w_max, + act); + CHECK_EQ(r, 0); +} + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(var_conv_2d, + kXPU, + kFloat, + kNCHW, + paddle::lite::kernels::xpu::VarConv2DCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))}) + .BindOutput("Col", {LiteType::GetTensorTy(TARGET(kXPU))}) + .Finalize(); diff --git a/lite/kernels/xpu/var_conv_2d_compute.h b/lite/kernels/xpu/var_conv_2d_compute.h new file mode 100644 index 
0000000000000000000000000000000000000000..4d9f0ca7a9851a0c3071e72519c4ad1f40ea3483 --- /dev/null +++ b/lite/kernels/xpu/var_conv_2d_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/backends/xpu/target_wrapper.h" // XPUScratchPadGuard +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace xpu { + +class VarConv2DCompute : public KernelLite { + public: + using param_t = operators::VarConv2DParam; + + void PrepareForRun() override; + + void Run() override; + + private: + XPUScratchPadGuard offset_x_xpu_guard_; + XPUScratchPadGuard offset_y_xpu_guard_; + std::unique_ptr offset_x_cpu; + std::unique_ptr offset_y_cpu; +}; + +} // namespace xpu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/CMakeLists.txt b/lite/model_parser/CMakeLists.txt index 34d524c5c1b86fb6b689b86089c355e3de42a34e..a83cecf4444910e710d0eb92b9c3449190f5bda2 100644 --- a/lite/model_parser/CMakeLists.txt +++ b/lite/model_parser/CMakeLists.txt @@ -1,8 +1,9 @@ if (NOT LITE_ON_TINY_PUBLISH) add_subdirectory(pb) endif() -add_subdirectory(cpp) +add_subdirectory(general) add_subdirectory(naive_buffer) +add_subdirectory(flatbuffers) #lite_cc_library(runtime_lite SRCS runtime.cc) diff --git a/lite/model_parser/desc_apis.h b/lite/model_parser/base/apis.h similarity index 95% rename from lite/model_parser/desc_apis.h rename to lite/model_parser/base/apis.h index 28d7f84b2a574a0399046636c9b809c0878f8d4d..2ad6ff47ee17fcdfab335b3a6f87229811d971ae 100644 --- a/lite/model_parser/desc_apis.h +++ b/lite/model_parser/base/apis.h @@ -17,5 +17,6 @@ #include "lite/model_parser/base/block_desc.h" #include "lite/model_parser/base/op_desc.h" #include "lite/model_parser/base/program_desc.h" +#include "lite/model_parser/base/traits.h" #include "lite/model_parser/base/var_desc.h" #include "lite/utils/all.h" diff --git a/lite/model_parser/base/block_desc.h b/lite/model_parser/base/block_desc.h index f4ddfddf406e76905f0286441d09b50402513ac6..3fd7998aa392034173f7474bc6b4d106f9fbcbd4 100644 --- a/lite/model_parser/base/block_desc.h +++ b/lite/model_parser/base/block_desc.h @@ -17,6 +17,7 @@ #include #include #include +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -46,11 +47,11 @@ class BlockDescReadAPI { class BlockDescWriteAPI { public: - virtual void SetIdx(int32_t idx) = 0; - virtual void SetParentIdx(int32_t idx) = 0; - virtual void ClearVars() = 0; - virtual void ClearOps() = 0; - virtual void SetForwardBlockIdx(int32_t idx) = 0; + virtual void SetIdx(int32_t idx) { NotImplemented(); } + virtual void SetParentIdx(int32_t idx) { NotImplemented(); } + virtual void ClearVars() { NotImplemented(); } + virtual void ClearOps() { NotImplemented(); } + virtual void SetForwardBlockIdx(int32_t idx) { NotImplemented(); } template T* AddVar(); @@ -59,6 +60,11 @@ class BlockDescWriteAPI { T* AddOp(); 
virtual ~BlockDescWriteAPI() = default; + + private: + void NotImplemented() const { + LOG(FATAL) << "BlockDescWriteAPI is not available in model read-only mode."; + } }; // The reading and writing of the model are one-time and separate. diff --git a/lite/model_parser/base/op_desc.h b/lite/model_parser/base/op_desc.h index 144f7064f07f16f58b1aa97da819862acb312a63..185f5917c46127de1e16e274d0be95073b1a37f6 100644 --- a/lite/model_parser/base/op_desc.h +++ b/lite/model_parser/base/op_desc.h @@ -15,56 +15,13 @@ #pragma once #include #include +#include "lite/model_parser/base/traits.h" +#include "lite/utils/cp_logging.h" #include "lite/utils/string.h" namespace paddle { namespace lite { -// The AttrType is used to make the proto::AttrType portable. -enum class OpAttrType { - INT = 0, - FLOAT = 1, - STRING = 2, - INTS = 3, - FLOATS = 4, - STRINGS = 5, - BOOLEAN = 6, - BOOLEANS = 7, - BLOCK = 8, - LONG = 9, - BLOCKS = 10, - LONGS = 11, - UNK, -}; - -template -struct OpAttrTypeTrait; - -template -struct OpDataTypeTrait; - -#define TYPE_TRAIT_IMPL(T, type__) \ - template <> \ - struct OpAttrTypeTrait { \ - typedef type__ DT; \ - }; \ - template <> \ - struct OpDataTypeTrait { \ - static constexpr OpAttrType AT = OpAttrType::T; \ - static constexpr const char* ATN = #T; \ - }; - -TYPE_TRAIT_IMPL(INT, int32_t); -TYPE_TRAIT_IMPL(FLOAT, float); -TYPE_TRAIT_IMPL(STRING, std::string); -TYPE_TRAIT_IMPL(BOOLEAN, bool); -TYPE_TRAIT_IMPL(LONG, int64_t); -TYPE_TRAIT_IMPL(INTS, std::vector); -TYPE_TRAIT_IMPL(FLOATS, std::vector); -TYPE_TRAIT_IMPL(STRINGS, std::vector); -TYPE_TRAIT_IMPL(LONGS, std::vector); -#undef TYPE_TRAIT_IMPL - class OpDescReadAPI { public: virtual std::string Type() const = 0; @@ -105,16 +62,25 @@ class OpDescReadAPI { class OpDescWriteAPI { public: - virtual void SetType(const std::string& type) = 0; + virtual void SetType(const std::string& type) { NotImplemented(); } virtual void SetInput(const std::string& param, - const std::vector& args) = 0; + const std::vector& args) { + NotImplemented(); + } virtual void SetOutput(const std::string& param, - const std::vector& args) = 0; + const std::vector& args) { + NotImplemented(); + } template void SetAttr(const std::string& name, const T& v); virtual ~OpDescWriteAPI() = default; + + private: + void NotImplemented() const { + LOG(FATAL) << "OpDescWriteAPI is not available in model read-only mode."; + } }; // The reading and writing of the model are one-time and separate. diff --git a/lite/model_parser/base/program_desc.h b/lite/model_parser/base/program_desc.h index f04aa1ddf6f62e2eb3129c92f53c9401b6fdefc7..c4423f288d8ea90039ffad0db08342b594415fe6 100644 --- a/lite/model_parser/base/program_desc.h +++ b/lite/model_parser/base/program_desc.h @@ -14,6 +14,8 @@ #pragma once +#include "lite/utils/cp_logging.h" + namespace paddle { namespace lite { @@ -34,13 +36,19 @@ class ProgramDescReadAPI { class ProgramDescWriteAPI { public: - virtual void ClearBlocks() = 0; - virtual void SetVersion(int64_t version) = 0; + virtual void ClearBlocks() { NotImplemented(); } + virtual void SetVersion(int64_t version) { NotImplemented(); } template T* AddBlock(); virtual ~ProgramDescWriteAPI() = default; + + private: + void NotImplemented() const { + LOG(FATAL) + << "ProgramDescWriteAPI is not available in model read-only mode."; + } }; // The reading and writing of the model are one-time and separate. 
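For reference, the base desc headers above turn the write-side setters from pure virtual methods into defaults that log fatally, so read-only backends (such as the flatbuffers view classes) only override what they actually support. A stripped-down sketch of the pattern with illustrative names (not the Lite classes):

```cpp
#include <cstdint>
#include <cstdlib>
#include <iostream>

// Write interface: every setter has a "not available in read-only mode"
// default instead of being pure virtual, mirroring ProgramDescWriteAPI.
class WriteAPI {
 public:
  virtual void SetVersion(int64_t) { NotImplemented(); }
  virtual ~WriteAPI() = default;

 private:
  void NotImplemented() const {
    std::cerr << "write API not available in read-only mode\n";
    std::abort();
  }
};

// A read-only view inherits the write interface without implementing it.
class ReadOnlyDesc : public WriteAPI {};

// A mutable desc overrides only the setters it actually supports.
class MutableDesc : public WriteAPI {
 public:
  void SetVersion(int64_t v) override { version_ = v; }
  int64_t version_ = 0;
};

int main() {
  MutableDesc m;
  m.SetVersion(2);
  std::cout << "version=" << m.version_ << "\n";
  // ReadOnlyDesc{}.SetVersion(1);  // would abort at runtime, by design
  return 0;
}
```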
diff --git a/lite/model_parser/base/traits.h b/lite/model_parser/base/traits.h new file mode 100644 index 0000000000000000000000000000000000000000..bda293686c7996abb9b0fe36edcc84407ed3b541 --- /dev/null +++ b/lite/model_parser/base/traits.h @@ -0,0 +1,82 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace paddle { +namespace lite { + +// The AttrType is used to make the proto::AttrType portable. +enum class OpAttrType { + INT = 0, + FLOAT = 1, + STRING = 2, + INTS = 3, + FLOATS = 4, + STRINGS = 5, + BOOLEAN = 6, + BOOLEANS = 7, + BLOCK = 8, + LONG = 9, + BLOCKS = 10, + LONGS = 11, + UNK, +}; + +struct Standard {}; +struct Flatbuffers {}; + +template +class VectorView; + +template +struct OpDataTypeTrait; + +#define ATTR_TYPE_TRAIT_IMPL(T, type__) \ + template \ + struct OpDataTypeTrait { \ + typedef type__ ET; \ + typedef type__ RT; \ + static constexpr OpAttrType AT = OpAttrType::T; \ + static constexpr const char* ATN = #T; \ + }; +#define ATTR_VECTOR_TYPE_TRAIT_IMPL(T, type__) \ + template \ + struct OpDataTypeTrait, U> { \ + typedef type__ ET; \ + typedef VectorView RT; \ + static constexpr OpAttrType AT = OpAttrType::T; \ + static constexpr const char* ATN = #T; \ + }; + +ATTR_TYPE_TRAIT_IMPL(BLOCK, int16_t); +ATTR_TYPE_TRAIT_IMPL(INT, int32_t); +ATTR_TYPE_TRAIT_IMPL(FLOAT, float); +ATTR_TYPE_TRAIT_IMPL(STRING, std::string); +ATTR_TYPE_TRAIT_IMPL(BOOLEAN, bool); +ATTR_TYPE_TRAIT_IMPL(LONG, int64_t); + +ATTR_VECTOR_TYPE_TRAIT_IMPL(INTS, int32_t); +ATTR_VECTOR_TYPE_TRAIT_IMPL(FLOATS, float); +ATTR_VECTOR_TYPE_TRAIT_IMPL(STRINGS, std::string); +ATTR_VECTOR_TYPE_TRAIT_IMPL(LONGS, int64_t); + +#undef ATTR_TYPE_TRAIT_IMPL +#undef ATTR_VECTOR_TYPE_TRAIT_IMPL + +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/base/var_desc.h b/lite/model_parser/base/var_desc.h index 0aa88d02d540f297c2995b8e7c1ccf4eca8472c0..47596f8792a83677a036bcb3d937e67576204546 100644 --- a/lite/model_parser/base/var_desc.h +++ b/lite/model_parser/base/var_desc.h @@ -16,6 +16,7 @@ #include #include +#include "lite/utils/cp_logging.h" namespace paddle { namespace lite { @@ -62,11 +63,16 @@ class VarDescReadAPI { class VarDescWriteAPI { public: - virtual void SetName(std::string name) = 0; - virtual void SetType(VarDataType type) = 0; - virtual void SetPersistable(bool persistable) = 0; - virtual void SetShape(const std::vector& dims) = 0; + virtual void SetName(std::string name) { NotImplemented(); } + virtual void SetType(VarDataType type) { NotImplemented(); } + virtual void SetPersistable(bool persistable) { NotImplemented(); } + virtual void SetShape(const std::vector& dims) { NotImplemented(); } virtual ~VarDescWriteAPI() = default; + + private: + void NotImplemented() const { + LOG(FATAL) << "VarDescWriteAPI is not available in model read-only mode."; + } }; // The reading and writing of the model are one-time and separate. 
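For reference, traits.h above maps each attribute kind to an element type (ET) and a return type (RT), with vector attributes returning a backend-tagged VectorView rather than a concrete std::vector. A minimal analogue showing the intended compile-time mapping (all types here are stand-ins, not the Lite traits):

```cpp
#include <cstdint>
#include <type_traits>
#include <vector>

// Cut-down analogue of lite::OpDataTypeTrait: scalar attributes return the
// value type itself, vector attributes return a non-owning view type.
struct Standard {};

template <typename T, typename U = Standard>
struct View { const std::vector<T>* v; };

template <typename T, typename U = Standard>
struct AttrTrait;

template <typename U>
struct AttrTrait<int32_t, U> {
  using ET = int32_t;
  using RT = int32_t;           // scalars are returned by value
};

template <typename U>
struct AttrTrait<std::vector<int32_t>, U> {
  using ET = int32_t;
  using RT = View<int32_t, U>;  // vectors are returned as a view
};

static_assert(std::is_same<AttrTrait<int32_t>::RT, int32_t>::value, "");
static_assert(
    std::is_same<AttrTrait<std::vector<int32_t>>::RT, View<int32_t>>::value,
    "");

int main() { return 0; }
```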
diff --git a/lite/model_parser/base/vector_view.h b/lite/model_parser/base/vector_view.h new file mode 100644 index 0000000000000000000000000000000000000000..c6337faa403a2c9a2758b90a4c1f7d092554b0b2 --- /dev/null +++ b/lite/model_parser/base/vector_view.h @@ -0,0 +1,84 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/model_parser/base/traits.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace vector_view { + +template +struct ElementTraits { + typedef T element_type; +}; + +template +struct VectorTraits; + +template +struct VectorTraits { + typedef std::vector vector_type; + typedef typename vector_type::const_iterator const_iterator; + typedef typename vector_type::const_reference const_reference; + typedef const_reference subscript_return_type; +}; + +} // namespace vector_view + +// In the process of optimizing the performance of model loading, we found +// that it was necessary to reduce the copying and construction of STL +// containers. So use VectorView to simulate the operation of STL containers +// without copying, such as iteration and subscripting. +// +// Currently, VectorView is applicable to STL vector and Flatbuffers Vector. +// We used the template Traits to unify the behavior of the two, and provided +// an implicit conversion operator from VectorView to STL vector. Please use +// implicit conversion with caution because it will bring significant overhead. 
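The comment above gives the rationale; the Standard specialization that follows simply wraps a pointer to an existing std::vector. A self-contained stand-in showing the intended usage, including where the deliberately costly implicit copy kicks in (this class is an illustration, not the in-tree VectorView):

```cpp
#include <cstdio>
#include <vector>

// Minimal stand-in for lite::VectorView<T, Standard>: a non-owning wrapper
// that forwards iteration and subscripting to the underlying std::vector.
template <typename T>
class VectorViewLike {
 public:
  explicit VectorViewLike(const std::vector<T>* v) : v_(v) {}
  const T& operator[](size_t i) const { return (*v_)[i]; }
  typename std::vector<T>::const_iterator begin() const { return v_->begin(); }
  typename std::vector<T>::const_iterator end() const { return v_->end(); }
  size_t size() const { return v_->size(); }
  // Implicit conversion copies every element: cheap to write, costly to run.
  operator std::vector<T>() const { return *v_; }

 private:
  const std::vector<T>* v_;
};

int main() {
  std::vector<int> dims{1, 3, 224, 224};
  VectorViewLike<int> view(&dims);
  long prod = 1;
  for (int d : view) prod *= d;  // iteration, no copy
  std::vector<int> copy = view;  // explicit opt-in copy
  std::printf("numel=%ld copied=%zu\n", prod, copy.size());
  return 0;
}
```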
+ +template +class VectorView { + public: + typedef vector_view::VectorTraits Traits; + explicit VectorView(typename Traits::vector_type const* cvec) { + cvec_ = cvec; + } + typename Traits::subscript_return_type operator[](size_t i) const { + return cvec_->operator[](i); + } + typename Traits::const_iterator begin() const { return cvec_->begin(); } + typename Traits::const_iterator end() const { return cvec_->end(); } + size_t size() const { return cvec_->size(); } + operator std::vector() const { + VLOG(5) << "Copying elements out of VectorView will damage performance."; + std::vector tmp; + tmp.reserve(cvec_->size()); + for (auto val : *cvec_) { + tmp.push_back(val); + } + return tmp; + } + ~VectorView() = default; + + private: + typename Traits::vector_type const* cvec_; +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/compatibility.cc b/lite/model_parser/compatibility.cc index 67d7c9d69152d31d1381ea847ef859a08e4f82a7..dd43f7bd25277e34a2fd8b04aae6b705402a0436 100644 --- a/lite/model_parser/compatibility.cc +++ b/lite/model_parser/compatibility.cc @@ -20,10 +20,7 @@ #include "lite/model_parser/naive_buffer/program_desc.h" #include "lite/model_parser/naive_buffer/var_desc.h" #ifndef LITE_ON_TINY_PUBLISH -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/program_desc.h" -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/cpp_desc.h" #endif namespace paddle { diff --git a/lite/model_parser/compatibility.h b/lite/model_parser/compatibility.h index 9e421d709d1823852d6dac5cd0070b4330f56752..a47870cf9c4d8e1743f2eb749823e88f18b33900 100644 --- a/lite/model_parser/compatibility.h +++ b/lite/model_parser/compatibility.h @@ -17,7 +17,7 @@ #include #include #include "lite/api/paddle_place.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" namespace paddle { namespace lite { diff --git a/lite/model_parser/compatibility_test.cc b/lite/model_parser/compatibility_test.cc index b3cb38f1c95649567b72d73b8938420537ec7b5b..957bcb25ea68b5555c9937de4e87dc8e9c4923b1 100644 --- a/lite/model_parser/compatibility_test.cc +++ b/lite/model_parser/compatibility_test.cc @@ -17,10 +17,7 @@ #include "lite/api/paddle_lite_factory_helper.h" #include "lite/model_parser/compatible_pb.h" -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/program_desc.h" -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/cpp_desc.h" USE_LITE_KERNEL(leaky_relu, kCUDA, kFloat, kNCHW, def); diff --git a/lite/model_parser/compatible_pb.h b/lite/model_parser/compatible_pb.h index 80fee49133130b09fbdd490ed86dce0af924aac1..c9889a5879160dd60ec64c4806df8af888db99c9 100644 --- a/lite/model_parser/compatible_pb.h +++ b/lite/model_parser/compatible_pb.h @@ -21,10 +21,7 @@ * lite::pb::XXDesc/lite::naive_buffer::XXDesc. 
*/ -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/program_desc.h" -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/cpp_desc.h" namespace paddle { namespace lite { diff --git a/lite/model_parser/compatible_pb_test.cc b/lite/model_parser/compatible_pb_test.cc index 088b64bf2cd13ce0f443f962bd2cb5f709c4d4f2..d9a46e463209eb33e6f2cb53f4644056f88e7085 100644 --- a/lite/model_parser/compatible_pb_test.cc +++ b/lite/model_parser/compatible_pb_test.cc @@ -14,10 +14,7 @@ #include "lite/model_parser/compatible_pb.h" #include -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/program_desc.h" -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/model_parser/naive_buffer/block_desc.h" #include "lite/model_parser/naive_buffer/op_desc.h" #include "lite/model_parser/naive_buffer/program_desc.h" diff --git a/lite/model_parser/cpp_desc.h b/lite/model_parser/cpp_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..477f90a28d7bf1e31dbc648b18af42381e0c93d6 --- /dev/null +++ b/lite/model_parser/cpp_desc.h @@ -0,0 +1,26 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/model_parser/general/block_desc.h" +#include "lite/model_parser/general/op_desc.h" +#include "lite/model_parser/general/program_desc.h" +#include "lite/model_parser/general/var_desc.h" + +namespace paddle { +namespace lite { +namespace cpp = general; +} +} diff --git a/lite/model_parser/flatbuffers/CMakeLists.txt b/lite/model_parser/flatbuffers/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ca669bfeb512de47f3a15eb7119f12487accc8a --- /dev/null +++ b/lite/model_parser/flatbuffers/CMakeLists.txt @@ -0,0 +1,16 @@ +function(lite_fbs_library TARGET) + set(multiValueArgs SRCS FBS_DEPS) + cmake_parse_arguments(args "" "" "${multiValueArgs}" ${ARGN}) + lite_cc_library(${TARGET} SRCS ${args_SRCS}) + add_dependencies(${TARGET} ${args_FBS_DEPS}) +endfunction() + +lite_fbs_library(fbs_op_desc SRCS op_desc.cc FBS_DEPS framework_fbs_header) +lite_fbs_library(fbs_var_desc SRCS var_desc.cc FBS_DEPS framework_fbs_header) +lite_fbs_library(fbs_block_desc SRCS block_desc.cc FBS_DEPS framework_fbs_header) +lite_fbs_library(fbs_program_desc SRCS program_desc.cc FBS_DEPS framework_fbs_header) + +lite_cc_test(test_vector_view SRCS vector_view_test.cc) +if (TARGET test_vector_view) + add_dependencies(test_vector_view framework_fbs_header) +endif() diff --git a/lite/model_parser/flatbuffers/block_desc.cc b/lite/model_parser/flatbuffers/block_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..fc43af6d6273c845f00e2046ae846f044659fe57 --- /dev/null +++ b/lite/model_parser/flatbuffers/block_desc.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/model_parser/flatbuffers/block_desc.h" + +namespace paddle { +namespace lite { +namespace fbs { + +template <> +proto::VarDesc* BlockDesc::GetVar(int32_t idx) { + CHECK_LT(idx, VarsSize()) << "idx >= vars.size()"; + return const_cast(desc_->vars()->Get(idx)); +} + +template <> +proto::OpDesc* BlockDesc::GetOp(int32_t idx) { + CHECK_LT(idx, OpsSize()) << "idx >= ops.size()"; + return const_cast(desc_->ops()->Get(idx)); +} + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/block_desc.h b/lite/model_parser/flatbuffers/block_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..0bfef5a452051c37e31f9d2c6ab2504e9addd800 --- /dev/null +++ b/lite/model_parser/flatbuffers/block_desc.h @@ -0,0 +1,69 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "lite/model_parser/base/block_desc.h" +#include "lite/model_parser/flatbuffers/framework_generated.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace fbs { + +class BlockDesc : public BlockDescAPI { + public: + explicit BlockDesc(proto::BlockDesc* desc) : desc_(desc) { CHECK(desc_); } + + int32_t Idx() const override { return desc_->idx(); } + + int32_t ParentIdx() const override { return desc_->parent_idx(); } + + size_t VarsSize() const override { return desc_->vars()->size(); } + + template + T* GetVar(int32_t idx); + + template + T const* GetVar(int32_t idx) const { + return GetVar(idx); + } + + size_t OpsSize() const override { + CHECK(desc_); + CHECK(desc_->ops()); + return desc_->ops()->size(); + } + + template + T* GetOp(int32_t idx); + + template + T const* GetOp(int32_t idx) const { + return GetOp(idx); + } + + int32_t ForwardBlockIdx() const override { + return desc_->forward_block_idx(); + } + + BlockDesc() = delete; + + private: + proto::BlockDesc* desc_; // not_own +}; + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/framework.fbs b/lite/model_parser/flatbuffers/framework.fbs new file mode 100644 index 0000000000000000000000000000000000000000..90f6e626088003975f18303e47230a85c303181d --- /dev/null +++ b/lite/model_parser/flatbuffers/framework.fbs @@ -0,0 +1,172 @@ +// Generated from framework.proto + +namespace paddle.lite.fbs.proto; + +enum AttrType : int { + INT = 0, + FLOAT = 1, + STRING = 2, + INTS = 3, + FLOATS = 4, + STRINGS = 5, + BOOLEAN = 6, + BOOLEANS = 7, + BLOCK = 8, + LONG = 9, + BLOCKS = 10, + LONGS = 11, +} + +namespace paddle.lite.fbs.proto.VarType_; + +enum Type : int { + BOOL = 0, + INT16 = 1, + INT32 = 2, + INT64 = 3, + FP16 = 4, + FP32 = 5, + FP64 = 6, + LOD_TENSOR = 7, + SELECTED_ROWS = 8, + FEED_MINIBATCH = 9, + FETCH_LIST = 10, + STEP_SCOPES = 11, + LOD_RANK_TABLE = 12, + LOD_TENSOR_ARRAY = 13, + PLACE_LIST = 14, + READER = 15, + RAW = 17, + TUPLE = 18, + SIZE_T = 19, + UINT8 = 20, + INT8 = 21, +} + +namespace paddle.lite.fbs.proto.CompatibleInfo_; + +enum Type : int { + COMPATIBLE = 0, + DEFINITELY_NOT = 1, + POSSIBLE = 2, + BUG_FIX = 3, + PRECISION_CHANGE = 4, +} + +namespace paddle.lite.fbs.proto; + +table Version { + version:long; +} + +table OpDesc { + type:string (required); + inputs:[paddle.lite.fbs.proto.OpDesc_.Var]; + outputs:[paddle.lite.fbs.proto.OpDesc_.Var]; + attrs:[paddle.lite.fbs.proto.OpDesc_.Attr]; + is_target:bool; +} + +namespace paddle.lite.fbs.proto.OpDesc_; + +table Attr { + name:string (required, key); + type:paddle.lite.fbs.proto.AttrType; + i:int; + f:float; + s:string; + ints:[int]; + floats:[float]; + strings:[string]; + b:bool; + bools:[bool]; + block_idx:int; + l:long; + blocks_idx:[int]; + longs:[long]; +} + +table Var { + parameter:string (required, key); + arguments:[string]; +} + +namespace paddle.lite.fbs.proto; + +table VarType { + type:paddle.lite.fbs.proto.VarType_.Type; + selected_rows:paddle.lite.fbs.proto.VarType_.TensorDesc; + lod_tensor:paddle.lite.fbs.proto.VarType_.LoDTensorDesc; + tensor_array:paddle.lite.fbs.proto.VarType_.LoDTensorArrayDesc; + reader:paddle.lite.fbs.proto.VarType_.ReaderDesc; + tuple:paddle.lite.fbs.proto.VarType_.Tuple; +} + +namespace paddle.lite.fbs.proto.VarType_; + +table TensorDesc { + data_type:paddle.lite.fbs.proto.VarType_.Type; + dims:[long]; +} + +table LoDTensorDesc { + tensor:paddle.lite.fbs.proto.VarType_.TensorDesc (required); + lod_level:int; +} + +table 
LoDTensorArrayDesc { + tensor:paddle.lite.fbs.proto.VarType_.TensorDesc (required); + lod_level:int; +} + +table ReaderDesc { + lod_tensor:[paddle.lite.fbs.proto.VarType_.LoDTensorDesc]; +} + +table Tuple { + element_type:[paddle.lite.fbs.proto.VarType_.Type]; +} + +namespace paddle.lite.fbs.proto; + +table VarDesc { + name:string (required, key); + type:paddle.lite.fbs.proto.VarType (required); + persistable:bool; + need_check_feed:bool; +} + +table BlockDesc { + idx:int; + parent_idx:int; + vars:[paddle.lite.fbs.proto.VarDesc]; + ops:[paddle.lite.fbs.proto.OpDesc]; + forward_block_idx:int = -1; +} + +table CompatibleInfo { + version:string (required); + type:paddle.lite.fbs.proto.CompatibleInfo_.Type; +} + +table OpCompatibleMap { + pair:[paddle.lite.fbs.proto.OpCompatibleMap_.OpCompatiblePair]; + default_required_version:string; +} + +namespace paddle.lite.fbs.proto.OpCompatibleMap_; + +table OpCompatiblePair { + op_name:string (required, key); + compatible_info:paddle.lite.fbs.proto.CompatibleInfo (required); +} + +namespace paddle.lite.fbs.proto; + +table ProgramDesc { + blocks:[paddle.lite.fbs.proto.BlockDesc]; + version:paddle.lite.fbs.proto.Version; + op_compatible_map:paddle.lite.fbs.proto.OpCompatibleMap; +} + +root_type paddle.lite.fbs.proto.ProgramDesc; diff --git a/lite/model_parser/flatbuffers/op_desc.cc b/lite/model_parser/flatbuffers/op_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..9e416b020d8fed0861d1d0b02ae74a9ccc47df59 --- /dev/null +++ b/lite/model_parser/flatbuffers/op_desc.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/model_parser/flatbuffers/op_desc.h" + +namespace paddle { +namespace lite { +namespace fbs { + +template <> +std::string OpDesc::GetAttr(const std::string& name) const { + const auto& it = desc_->attrs()->LookupByKey(name.c_str()); + if (!it->s()) { + return std::string(); + } + return it->s()->str(); +} + +template <> +std::string OpDesc::GetAttr(size_t idx) const { + const auto& it = desc_->attrs()->Get(idx); + if (!it->s()) { + return std::string(); + } + return it->s()->str(); +} + +template <> +lite::VectorView +OpDesc::GetAttr>(const std::string& name) const { + const auto& it = desc_->attrs()->LookupByKey(name.c_str()); + CHECK(it) << "Attr " << name << "does not exist."; + return VectorView(it->strings()); +} + +template <> +VectorView OpDesc::GetAttr>( + size_t idx) const { + const auto& it = desc_->attrs()->Get(idx); + CHECK(it) << "Attr " << idx << "does not exist."; + return VectorView(it->strings()); +} + +#define GET_ATTR_IMPL(T, fb_f__) \ + template <> \ + typename lite::OpDataTypeTrait::RT OpDesc::GetAttr( \ + const std::string& name) const { \ + const auto& it = desc_->attrs()->LookupByKey(name.c_str()); \ + return it->fb_f__(); \ + } \ + template <> \ + typename lite::OpDataTypeTrait::RT OpDesc::GetAttr( \ + size_t idx) const { \ + const auto& it = desc_->attrs()->Get(idx); \ + return it->fb_f__(); \ + } + +#define GET_ATTRS_IMPL(T, fb_f__) \ + template <> \ + typename lite::OpDataTypeTrait::RT OpDesc::GetAttr( \ + const std::string& name) const { \ + const auto& it = desc_->attrs()->LookupByKey(name.c_str()); \ + return typename lite::OpDataTypeTrait::RT(it->fb_f__()); \ + } \ + template <> \ + typename lite::OpDataTypeTrait::RT OpDesc::GetAttr( \ + size_t idx) const { \ + const auto& it = desc_->attrs()->Get(idx); \ + return typename lite::OpDataTypeTrait::RT(it->fb_f__()); \ + } + +GET_ATTR_IMPL(int32_t, i); +GET_ATTR_IMPL(int16_t, block_idx); +GET_ATTR_IMPL(float, f); +GET_ATTR_IMPL(bool, b); +GET_ATTR_IMPL(int64_t, l); +GET_ATTRS_IMPL(std::vector, ints); +GET_ATTRS_IMPL(std::vector, floats); +GET_ATTRS_IMPL(std::vector, longs); + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/op_desc.h b/lite/model_parser/flatbuffers/op_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..b2d78ca68af3d2f0595e710d9c0f75d8cceefbb3 --- /dev/null +++ b/lite/model_parser/flatbuffers/op_desc.h @@ -0,0 +1,200 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include + +#include "lite/model_parser/base/op_desc.h" +#include "lite/model_parser/flatbuffers/framework_generated.h" +#include "lite/model_parser/flatbuffers/vector_view.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace fbs { + +class OpDesc : public OpDescAPI { + public: + explicit OpDesc(proto::OpDesc* desc) : desc_(desc) { CHECK(desc_); } + + std::string Type() const override { return desc_->type()->str(); } + + // Get the arguments of parameter called `param` + std::vector Input(const std::string& param) const override { + const auto& var = desc_->inputs()->LookupByKey(param.c_str()); + std::vector args_vec; + if (var->arguments()) { + args_vec.reserve(var->arguments()->size()); + for (const auto& in : *var->arguments()) { + args_vec.push_back(in->str()); + } + } + return args_vec; + } + + std::vector InputArgumentNames() const override { + const auto& vars = desc_->inputs(); + std::vector input_names_vec; + if (vars) { + input_names_vec.reserve(vars->size()); + for (const auto& in : *vars) { + input_names_vec.push_back(in->parameter()->str()); + } + } + return input_names_vec; + } + + std::vector Output(const std::string& param) const override { + const auto& var = desc_->outputs()->LookupByKey(param.c_str()); + std::vector args_vec; + if (var->arguments()) { + args_vec.reserve(var->arguments()->size()); + for (const auto& out : *var->arguments()) { + args_vec.push_back(out->str()); + } + } + return args_vec; + } + + std::vector OutputArgumentNames() const override { + const auto& vars = desc_->outputs(); + std::vector output_names_vec; + if (vars) { + output_names_vec.reserve(vars->size()); + for (const auto& out : *vars) { + output_names_vec.push_back(out->parameter()->str()); + } + } + return output_names_vec; + } + + bool HasAttr(const std::string& name) const override { + return desc_->attrs()->LookupByKey(name.c_str()) != nullptr; + } + + size_t AttrsSize() const { return desc_->attrs()->size(); } + + std::string AttrName(size_t idx) const { + return desc_->attrs()->Get(idx)->name()->str(); + } + + OpDescAPI::AttrType GetAttrType(const std::string& name) const override { + const auto& attr = desc_->attrs()->LookupByKey(name.c_str()); + CHECK(attr); + return static_cast(attr->type()); + } + + OpDescAPI::AttrType GetAttrType(size_t idx) const { + const auto& attr = desc_->attrs()->Get(idx); + CHECK(attr); + return static_cast(attr->type()); + } + + std::vector AttrNames() const override { + const auto& attrs = desc_->attrs(); + std::vector attr_names_vec; + if (attrs) { + attr_names_vec.reserve(attrs->size()); + for (const auto& attr : *attrs) { + attr_names_vec.push_back(attr->name()->str()); + } + } + return attr_names_vec; + } + + template + typename lite::OpDataTypeTrait::RT GetAttr( + const std::string& name) const; + + template + typename lite::OpDataTypeTrait::RT GetAttr(size_t idx) const; + + OpDesc() = delete; + + private: + proto::OpDesc* desc_; + + // To reduce overhead, we expect to use namespace aliasing to make cpp::Desc + // and flatbuffers::Desc replace each other. However, there is no direct + // inheritance relationship between the two data types, and the read-only + // version of flatbuffers lacks some write implementations. Therefore, at + // present, we are temporarily providing a default interface that triggers + // execution-time errors to avoid type ambiguity and compile-time errors + // caused by different building options. 
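As a concrete illustration of that trade-off, the sketch below shows how the read-only fbs::OpDesc is expected to be consumed. The attribute name "axis" and the function name are hypothetical; the proto::OpDesc pointer is assumed to point into a FlatBuffers model buffer.

#include <string>

#include "lite/model_parser/flatbuffers/op_desc.h"

void ReadOnlyOpDescSketch(paddle::lite::fbs::proto::OpDesc* raw) {
  paddle::lite::fbs::OpDesc op(raw);  // non-owning view over the buffer
  std::string type = op.Type();
  if (op.HasAttr("axis")) {  // "axis" is a made-up attribute name
    int32_t axis = op.GetAttr<int32_t>("axis");  // specialization from op_desc.cc
    (void)axis;
  }
  // Mutating accessors such as mutable_inputs() exist only so the class keeps
  // the same surface as general::OpDesc; calling them in read-only mode
  // terminates via LOG(FATAL) inside NotImplemented().
  (void)type;
}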
+ + public: + bool HasInput(const std::string& param) const { + return desc_->inputs()->LookupByKey(param.c_str()) != nullptr; + } + + const std::map>& inputs() const { + NotImplemented(); + return inputs_; + } + const std::map>& outputs() const { + NotImplemented(); + return outputs_; + } + std::map>* mutable_inputs() { + NotImplemented(); + return &inputs_; + } + std::map>* mutable_outputs() { + NotImplemented(); + return &outputs_; + } + + std::vector input_vars() const { + NotImplemented(); + return std::vector(); + } + + std::vector output_vars() const { + NotImplemented(); + return std::vector(); + } + + bool HasOutput(const std::string& param) const { + NotImplemented(); + return false; + } + + const std::map& attrs() const { + NotImplemented(); + return attrs_; + } + const std::map& attr_types() const { + NotImplemented(); + return attr_types_; + } + + private: + void NotImplemented() const { + LOG(FATAL) << "The additional interfaces of OpDesc is temporarily " + "unavailable in read-only mode."; + } + std::string type_; + std::map> inputs_; + std::map> outputs_; + std::map attrs_; + std::map attr_types_; +}; + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/program_desc.cc b/lite/model_parser/flatbuffers/program_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..36429103a72f7b54651aac8d30671f7b3c41956e --- /dev/null +++ b/lite/model_parser/flatbuffers/program_desc.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/model_parser/flatbuffers/program_desc.h" + +namespace paddle { +namespace lite { +namespace fbs { + +template <> +proto::BlockDesc* ProgramDesc::GetBlock(int32_t idx) { + CHECK_LT(idx, BlocksSize()) << "idx >= blocks.size()"; + return const_cast(desc_->blocks()->Get(idx)); +} + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/program_desc.h b/lite/model_parser/flatbuffers/program_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..f41fd996b2533321c2494ea6c15d53ed31a3e7c8 --- /dev/null +++ b/lite/model_parser/flatbuffers/program_desc.h @@ -0,0 +1,54 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include "lite/model_parser/base/program_desc.h" +#include "lite/model_parser/flatbuffers/framework_generated.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace fbs { + +class ProgramDesc : public ProgramDescAPI { + public: + ProgramDesc() = default; + explicit ProgramDesc(proto::ProgramDesc *desc) : desc_(desc) { CHECK(desc); } + + size_t BlocksSize() const override { return desc_->blocks()->size(); } + + template + T *GetBlock(int32_t idx); + + template + T const *GetBlock(int32_t idx) const { + return GetBlock(idx); + } + + bool HasVersion() const override { return desc_->version() != nullptr; } + + int64_t Version() const override { + CHECK(HasVersion()); + return desc_->version()->version(); + } + + private: + proto::ProgramDesc *desc_; // not_own +}; + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/var_desc.cc b/lite/model_parser/flatbuffers/var_desc.cc new file mode 100644 index 0000000000000000000000000000000000000000..a629ffd5e35223aee218a8798a597b8c684c8c62 --- /dev/null +++ b/lite/model_parser/flatbuffers/var_desc.cc @@ -0,0 +1,15 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/model_parser/flatbuffers/var_desc.h" diff --git a/lite/model_parser/flatbuffers/var_desc.h b/lite/model_parser/flatbuffers/var_desc.h new file mode 100644 index 0000000000000000000000000000000000000000..387e52ec3150e5bc01f365934c310fb1990ce1e4 --- /dev/null +++ b/lite/model_parser/flatbuffers/var_desc.h @@ -0,0 +1,83 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include "lite/model_parser/base/var_desc.h" +#include "lite/model_parser/flatbuffers/framework_generated.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace fbs { + +class VarDesc : public VarDescAPI { + public: + explicit VarDesc(proto::VarDesc* desc) : desc_(desc) {} + + std::string Name() const override { return desc_->name()->str(); } + + VarDescAPI::Type GetType() const override { + return static_cast(desc_->type()->type()); + } + + bool Persistable() const override { return desc_->persistable(); } + + std::vector GetShape() const override { + CHECK(GetType() == VarDescAPI::Type::LOD_TENSOR); + const auto& dims = desc_->type()->lod_tensor()->tensor()->dims(); + std::vector dims_vec; + dims_vec.reserve(dims->size()); + for (const auto& dim : *dims) { + dims_vec.push_back(dim); + } + return dims_vec; + } + + VarDesc() = delete; + + private: + proto::VarDesc* desc_; + + // To reduce overhead, we expect to use namespace aliasing to make cpp::Desc + // and flatbuffers::Desc replace each other. However, there is no direct + // inheritance relationship between the two data types, and the read-only + // version of flatbuffers lacks some write implementations. Therefore, at + // present, we are temporarily providing a default interface that triggers + // execution-time errors to avoid type ambiguity and compile-time errors + // caused by different building options. + + public: + VarDescAPI::Type GetDataType() const { + NotImplemented(); + return data_type_; + } + void SetDataType(Type data_type) { NotImplemented(); } + void SetShape(const std::vector& dims) { NotImplemented(); } + + private: + void NotImplemented() const { + LOG(FATAL) << "The additional interfaces of VarDesc is temporarily " + "unavailable in read-only mode."; + } + Type data_type_; + std::vector shape_; +}; + +} // namespace fbs +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/vector_view.h b/lite/model_parser/flatbuffers/vector_view.h new file mode 100644 index 0000000000000000000000000000000000000000..ccb700072690c3ecfe55549a1f39d3d574686c7d --- /dev/null +++ b/lite/model_parser/flatbuffers/vector_view.h @@ -0,0 +1,131 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include "flatbuffers/flatbuffers.h" +#include "lite/model_parser/base/vector_view.h" + +namespace paddle { +namespace lite { +namespace vector_view { + +template +struct ElementTraits::value>::type> { + typedef flatbuffers::Offset element_type; +}; + +template <> +struct ElementTraits { + typedef flatbuffers::Offset element_type; +}; + +template +struct VectorTraits { + typedef flatbuffers::Vector::element_type> + vector_type; + typedef typename vector_type::const_iterator const_iterator; + typedef typename const_iterator::value_type value_type; + typedef const typename const_iterator::reference const_reference; + typedef value_type subscript_return_type; +}; + +struct FBSStrIterator { + typedef flatbuffers::VectorIterator< + flatbuffers::Offset, + typename flatbuffers::IndirectHelper< + flatbuffers::Offset>::return_type> + VI; + + explicit FBSStrIterator(const VI& iter) { iter_ = iter; } + const VI& raw_iter() const { return iter_; } + + bool operator==(const FBSStrIterator& other) const { + return iter_ == other.raw_iter(); + } + + bool operator<(const FBSStrIterator& other) const { + return iter_ < other.raw_iter(); + } + + bool operator!=(const FBSStrIterator& other) const { + return iter_ != other.raw_iter(); + } + + ptrdiff_t operator-(const FBSStrIterator& other) const { + return iter_ - other.raw_iter(); + } + + std::string operator*() const { return iter_.operator*()->str(); } + std::string operator->() const { return iter_.operator->()->str(); } + + FBSStrIterator& operator++() { + iter_++; + return *this; + } + + FBSStrIterator& operator--() { + iter_--; + return *this; + } + + FBSStrIterator operator+(const size_t& offset) { + return FBSStrIterator(iter_ + offset); + } + + FBSStrIterator operator-(const size_t& offset) { + return FBSStrIterator(iter_ - offset); + } + + private: + VI iter_; +}; + +} // namespace vector_view + +template <> +class VectorView { + public: + typedef vector_view::VectorTraits Traits; + explicit VectorView(typename Traits::vector_type const* cvec) { + cvec_ = cvec; + } + std::string operator[](size_t i) const { return cvec_->operator[](i)->str(); } + vector_view::FBSStrIterator begin() const { + return vector_view::FBSStrIterator(cvec_->begin()); + } + vector_view::FBSStrIterator end() const { + return vector_view::FBSStrIterator(cvec_->end()); + } + size_t size() const { return cvec_->size(); } + operator std::vector() const { + VLOG(5) << "Copying elements out of VectorView will damage performance."; + std::vector tmp; + tmp.reserve(cvec_->size()); + for (auto val : *cvec_) { + tmp.push_back(val->str()); + } + return tmp; + } + ~VectorView() = default; + + private: + typename Traits::vector_type const* cvec_; +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/model_parser/flatbuffers/vector_view_test.cc b/lite/model_parser/flatbuffers/vector_view_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6512ee69bd4f34c0d6e49274d478404191fd9476 --- /dev/null +++ b/lite/model_parser/flatbuffers/vector_view_test.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/model_parser/flatbuffers/vector_view.h" +#include +#include +#include +#include +#include "lite/model_parser/flatbuffers/framework_generated.h" + +namespace paddle { +namespace lite { + +TEST(VectorView, std_vector) { + std::vector vector{1, 2, 3}; + VectorView vector_view(&vector); + size_t i = 0; + for (const auto& value : vector_view) { + EXPECT_EQ(value, vector[i]); + ++i; + } + for (size_t j = 0; j < vector_view.size(); ++j) { + EXPECT_EQ(vector_view[i], vector[i]); + } +} + +TEST(VectorView, Flatbuffers) { + using namespace flatbuffers; // NOLINT + using namespace paddle::lite::fbs; // NOLINT + + auto create_desc = [](FlatBufferBuilder& fbb) { + /* --------- Set --------- */ + // Attr + std::vector ints({-1, 0, 1, 2, 3}); + auto string_0 = fbb.CreateString("string_0"); + auto string_1 = fbb.CreateString("string_1"); + std::vector> strings; + strings.push_back(string_0); + strings.push_back(string_1); + auto attr = proto::OpDesc_::CreateAttrDirect(fbb, + nullptr, + proto::AttrType_INT, + 0, + 0.0f, + nullptr, + &ints, + nullptr, + &strings); + + // OpDesc + std::vector> attrs; + attrs.push_back(attr); + auto op_desc = + proto::CreateOpDescDirect(fbb, "hello!", nullptr, nullptr, &attrs); + + // BlockDesc 0 + std::vector> ops; + ops.push_back(op_desc); + auto block_0 = proto::CreateBlockDescDirect(fbb, 0, 0, nullptr, &ops); + + // BlockDesc 1 + auto block_1 = proto::CreateBlockDescDirect(fbb, 1); + + // ProgramDesc + std::vector> block_vector; + block_vector.push_back(block_0); + block_vector.push_back(block_1); + auto orc = proto::CreateProgramDescDirect(fbb, &block_vector); + fbb.Finish(orc); + }; + + FlatBufferBuilder fbb; + create_desc(fbb); + auto program = fbs::proto::GetProgramDesc(fbb.GetBufferPointer()); + + // BlockDesc View + VectorView block_view(program->blocks()); + EXPECT_EQ(block_view.size(), static_cast(2)); + EXPECT_EQ(block_view[0]->idx(), 0); + EXPECT_EQ(block_view[1]->idx(), 1); + + // OpDesc & Attr View + VectorView op_view(block_view[0]->ops()); + EXPECT_EQ(op_view[0]->type()->str(), std::string("hello!")); + VectorView attr_view(op_view[0]->attrs()); + + // int32_t View + VectorView ints_view(attr_view[0]->ints()); + std::vector ints({-1, 0, 1, 2, 3}); + size_t cnt_0 = 0; + for (const auto& i : ints_view) { + EXPECT_EQ(i, ints[cnt_0]); + ++cnt_0; + } + for (size_t i = 0; i < ints_view.size(); ++i) { + EXPECT_EQ(ints_view[i], ints[i]); + } + std::vector ints_2(ints_view); + for (size_t i = 0; i < ints_2.size(); ++i) { + EXPECT_EQ(ints_2[i], ints[i]); + } + + // String View + VectorView strings_view(attr_view[0]->strings()); + std::vector strings({"string_0", "string_1"}); + EXPECT_EQ(strings_view.size(), strings.size()); + size_t cnt_1 = 0; + for (const auto& s : strings_view) { + EXPECT_EQ(s, strings[cnt_1]); + ++cnt_1; + } + for (size_t i = 0; i < strings_view.size(); ++i) { + EXPECT_EQ(strings_view[i], strings[i]); + } + std::vector string_2(strings_view); + for (size_t i = 0; i < string_2.size(); ++i) { + EXPECT_EQ(string_2[i], strings[i]); + } +} + +} // namespace lite +} // namespace paddle diff --git 
a/lite/model_parser/cpp/CMakeLists.txt b/lite/model_parser/general/CMakeLists.txt similarity index 100% rename from lite/model_parser/cpp/CMakeLists.txt rename to lite/model_parser/general/CMakeLists.txt diff --git a/lite/model_parser/cpp/block_desc.cc b/lite/model_parser/general/block_desc.cc similarity index 92% rename from lite/model_parser/cpp/block_desc.cc rename to lite/model_parser/general/block_desc.cc index a4dc7cd72acacb6392cecdfe9a551773c1937888..0766333d66c1299b738098a33a1a2c6433782337 100644 --- a/lite/model_parser/cpp/block_desc.cc +++ b/lite/model_parser/general/block_desc.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/model_parser/cpp/block_desc.h" +#include "lite/model_parser/general/block_desc.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { template <> VarDesc* BlockDesc::GetVar(int32_t idx) { @@ -42,6 +42,6 @@ OpDesc* BlockDesc::AddOp() { return &ops_.back(); } -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/block_desc.h b/lite/model_parser/general/block_desc.h similarity index 88% rename from lite/model_parser/cpp/block_desc.h rename to lite/model_parser/general/block_desc.h index a6cd714e60a66398bffb5ed05a3d7d7eb1da9ac2..3b1b1ff4e6616c936bd3b09bff563656f6bdbc6a 100644 --- a/lite/model_parser/cpp/block_desc.h +++ b/lite/model_parser/general/block_desc.h @@ -14,16 +14,17 @@ #pragma once #include -#include "lite/model_parser/cpp/op_desc.h" -#include "lite/model_parser/cpp/var_desc.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" +#include "lite/model_parser/general/op_desc.h" +#include "lite/model_parser/general/var_desc.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { /* - * The cpp::BlockDesc is the internal representation for Op. All the internal + * The general::BlockDesc is the internal representation for Op. All the + * internal * imprementation should use it, not the pb::BlockDesc. */ class BlockDesc : public BlockDescAPI { @@ -82,6 +83,6 @@ class BlockDesc : public BlockDescAPI { int32_t forward_block_idx_; }; -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/op_desc.cc b/lite/model_parser/general/op_desc.cc similarity index 95% rename from lite/model_parser/cpp/op_desc.cc rename to lite/model_parser/general/op_desc.cc index a816943bb9689483f1eb60575147a42594db2654..b4589a14f26b641a0e48c69ec067cd847649b67e 100644 --- a/lite/model_parser/cpp/op_desc.cc +++ b/lite/model_parser/general/op_desc.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/model_parser/cpp/op_desc.h" +#include "lite/model_parser/general/op_desc.h" #include #include namespace paddle { namespace lite { -namespace cpp { +namespace general { std::vector OpDesc::OutputArgumentNames() const { std::vector res; @@ -69,6 +69,6 @@ bool OpDesc::HasOutput(const std::string& param) const { return it != outputs_.end(); } -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/op_desc.h b/lite/model_parser/general/op_desc.h similarity index 96% rename from lite/model_parser/cpp/op_desc.h rename to lite/model_parser/general/op_desc.h index dfd60c0793af650ede4327bc37f5dccac2e9ee67..e0c2541182adde6ab9171a55d859a5bd5a1195e2 100644 --- a/lite/model_parser/cpp/op_desc.h +++ b/lite/model_parser/general/op_desc.h @@ -17,16 +17,16 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/utils/any.h" #include "lite/utils/varient.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { /* - * The cpp::OpDesc is the internal representation for Op. All the internal + * The general::OpDesc is the internal representation for Op. All the internal * imprementation should use it, not the pb::OpDesc. */ class OpDesc : public OpDescAPI { @@ -131,6 +131,6 @@ class OpDesc : public OpDescAPI { } }; -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/program_desc.cc b/lite/model_parser/general/program_desc.cc similarity index 91% rename from lite/model_parser/cpp/program_desc.cc rename to lite/model_parser/general/program_desc.cc index 3c6adcddf319db57366e5b3cdb05bc6169f229ee..670c7684312265d5a1f1eb2cbef54ed5fe62b2d2 100644 --- a/lite/model_parser/cpp/program_desc.cc +++ b/lite/model_parser/general/program_desc.cc @@ -12,11 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/model_parser/cpp/program_desc.h" +#include "lite/model_parser/general/program_desc.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { template <> BlockDesc* ProgramDesc::GetBlock(int32_t idx) { @@ -30,6 +30,6 @@ BlockDesc* ProgramDesc::AddBlock() { return &blocks_.back(); } -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/program_desc.h b/lite/model_parser/general/program_desc.h similarity index 87% rename from lite/model_parser/cpp/program_desc.h rename to lite/model_parser/general/program_desc.h index 63ac8e0d79c16ea6e64daa4a0b1922a3350037cc..0fbc0742fe149075d3ede2b688fd071727baafc9 100644 --- a/lite/model_parser/cpp/program_desc.h +++ b/lite/model_parser/general/program_desc.h @@ -14,15 +14,16 @@ #pragma once #include -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" +#include "lite/model_parser/general/block_desc.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { /* - * The cpp::ProgramDesc is the internal representation for Op. All the internal + * The general::ProgramDesc is the internal representation for Op. All the + * internal * imprementation should use it, not the pb::ProgramDesc. 
*/ class ProgramDesc : public ProgramDescAPI { @@ -59,6 +60,6 @@ class ProgramDesc : public ProgramDescAPI { std::vector blocks_; }; -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/cpp/var_desc.cc b/lite/model_parser/general/var_desc.cc similarity index 92% rename from lite/model_parser/cpp/var_desc.cc rename to lite/model_parser/general/var_desc.cc index e30bb3eb55d274d5287702d6247b94d5d33c4e74..f2782d1778b07ef201401a62f9c7a6295159ef5f 100644 --- a/lite/model_parser/cpp/var_desc.cc +++ b/lite/model_parser/general/var_desc.cc @@ -12,4 +12,4 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "lite/model_parser/cpp/var_desc.h" +#include "lite/model_parser/general/var_desc.h" diff --git a/lite/model_parser/cpp/var_desc.h b/lite/model_parser/general/var_desc.h similarity index 91% rename from lite/model_parser/cpp/var_desc.h rename to lite/model_parser/general/var_desc.h index c56d7cce53180e0157913372f8b0da4c9cedd8c9..ed69d035dfbe837afa79a3f52bd2c0c925bd19ea 100644 --- a/lite/model_parser/cpp/var_desc.h +++ b/lite/model_parser/general/var_desc.h @@ -15,14 +15,14 @@ #pragma once #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" namespace paddle { namespace lite { -namespace cpp { +namespace general { /* - * The cpp::VarDesc is the internal representation for Op. All the internal + * The general::VarDesc is the internal representation for Op. All the internal * imprementation should use it, not the pb::VarDesc. */ class VarDesc : public VarDescAPI { @@ -59,6 +59,6 @@ class VarDesc : public VarDescAPI { std::vector shape_; }; -} // namespace cpp +} // namespace general } // namespace lite } // namespace paddle diff --git a/lite/model_parser/model_parser.cc b/lite/model_parser/model_parser.cc index ea94ca52e8f123da5077f3b751ab03b857e8c390..640dd044174c831e4570c5e8cc81af02fa50f0c4 100644 --- a/lite/model_parser/model_parser.cc +++ b/lite/model_parser/model_parser.cc @@ -21,7 +21,7 @@ #include "lite/core/tensor.h" #include "lite/core/variable.h" #include "lite/core/version.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/combined_params_desc.h" #include "lite/model_parser/naive_buffer/param_desc.h" #include "lite/model_parser/naive_buffer/program_desc.h" diff --git a/lite/model_parser/naive_buffer/block_desc.h b/lite/model_parser/naive_buffer/block_desc.h index ea4a779fb17559d3487c07b60bd18020fc0e9cce..61c624d9593244a3e680b5541e32cd4aeee949d5 100644 --- a/lite/model_parser/naive_buffer/block_desc.h +++ b/lite/model_parser/naive_buffer/block_desc.h @@ -17,7 +17,7 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" namespace paddle { diff --git a/lite/model_parser/naive_buffer/combined_params_desc.h b/lite/model_parser/naive_buffer/combined_params_desc.h index a5462ef5eea47867a737cd1eff344c696f9dc159..1131bab9615b53055d58ba962ad21e206ee70bfc 100644 --- a/lite/model_parser/naive_buffer/combined_params_desc.h +++ b/lite/model_parser/naive_buffer/combined_params_desc.h @@ -17,7 +17,7 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/param_desc.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" diff 
--git a/lite/model_parser/naive_buffer/op_desc.h b/lite/model_parser/naive_buffer/op_desc.h index cce0c22c2e717b6d622314f31af2dc418503c78b..f4cd2d8578cf69854fc4044b739fdfa3d6516d50 100644 --- a/lite/model_parser/naive_buffer/op_desc.h +++ b/lite/model_parser/naive_buffer/op_desc.h @@ -23,7 +23,7 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" namespace paddle { diff --git a/lite/model_parser/naive_buffer/param_desc.h b/lite/model_parser/naive_buffer/param_desc.h index 0a20b153312d99602ada77317e64c5934df0f070..ebbbdaf846a3550015ec97c11ccfb7d34271b6c5 100644 --- a/lite/model_parser/naive_buffer/param_desc.h +++ b/lite/model_parser/naive_buffer/param_desc.h @@ -17,7 +17,7 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" namespace paddle { diff --git a/lite/model_parser/naive_buffer/program_desc.h b/lite/model_parser/naive_buffer/program_desc.h index d3926b7c629c4bf56d104ca12c1fc70fbf3c0387..1552b6bcdd7ea7f8efd3954e2625712a7684a5f2 100644 --- a/lite/model_parser/naive_buffer/program_desc.h +++ b/lite/model_parser/naive_buffer/program_desc.h @@ -15,7 +15,7 @@ #pragma once #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" namespace paddle { diff --git a/lite/model_parser/naive_buffer/var_desc.h b/lite/model_parser/naive_buffer/var_desc.h index bf0845d7464f511dfb77812612c2b99c954600da..20c8e03a5433ba98c8dc3d98af25920a934ee31d 100644 --- a/lite/model_parser/naive_buffer/var_desc.h +++ b/lite/model_parser/naive_buffer/var_desc.h @@ -17,7 +17,7 @@ #include #include #include -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/model_parser/naive_buffer/naive_buffer_wrapper_helper.h" #include "lite/model_parser/naive_buffer/proto/framework.nb.h" diff --git a/lite/model_parser/pb/block_desc.h b/lite/model_parser/pb/block_desc.h index 2a34a51f686caab7aed6a9fb64bb405cd64a2d71..8844173798dcacf77c876f717b71c87cbc57e5e6 100644 --- a/lite/model_parser/pb/block_desc.h +++ b/lite/model_parser/pb/block_desc.h @@ -16,7 +16,7 @@ #include #include "lite/core/framework.pb.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/utils/cp_logging.h" namespace paddle { diff --git a/lite/model_parser/pb/op_desc.h b/lite/model_parser/pb/op_desc.h index f21c194a271b46c84b3a363c6f7c0d9c1f7b1f32..6f186e778298a5ae59a63188640725b3ae5322c9 100644 --- a/lite/model_parser/pb/op_desc.h +++ b/lite/model_parser/pb/op_desc.h @@ -26,7 +26,7 @@ #include #include #include "lite/core/framework.pb.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/utils/all.h" namespace paddle { diff --git a/lite/model_parser/pb/program_desc.h b/lite/model_parser/pb/program_desc.h index 9ff4c28a6d9adce85950bb7e83f15004d766d2dc..950bf5480db501289250ece88b28d1c1369e56fc 100644 --- a/lite/model_parser/pb/program_desc.h +++ b/lite/model_parser/pb/program_desc.h @@ -16,7 +16,7 @@ #include #include "lite/core/framework.pb.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/utils/cp_logging.h" namespace paddle { diff --git a/lite/model_parser/pb/var_desc.h b/lite/model_parser/pb/var_desc.h index 
eefacef4b0c90faf132b2e4ef141ac7009939db5..d36881d5892ca8b4bef754554d164409fab4b858 100644 --- a/lite/model_parser/pb/var_desc.h +++ b/lite/model_parser/pb/var_desc.h @@ -18,7 +18,7 @@ #include #include #include "lite/core/framework.pb.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" #include "lite/utils/cp_logging.h" namespace paddle { diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt index 192cffccb19040a5ab77feae4d8b6a5a5fe4ba00..45b49f91ace12da5934471e01afd91c2832f1d6d 100644 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -39,6 +39,7 @@ add_operator(unsqueeze_op_lite basic SRCS unsqueeze_op.cc DEPS ${op_DEPS}) add_operator(stack_op basic SRCS stack_op.cc DEPS ${op_DEPS}) add_operator(cast_op_lite basic SRCS cast_op.cc DEPS ${op_DEPS}) add_operator(affine_channel_op basic SRCS affine_channel_op.cc DEPS ${op_DEPS}) +add_operator(affine_grid_op basic SRCS affine_grid_op.cc DEPS ${op_DEPS}) add_operator(range_op basic SRCS range_op.cc DEPS ${op_DEPS}) add_operator(reduce_mean_op basic SRCS reduce_mean_op.cc DEPS ${op_DEPS}) add_operator(relu_op basic SRCS relu_op.cc DEPS ${op_DEPS}) @@ -76,6 +77,8 @@ add_operator(reduce_max_op_lite extra SRCS reduce_max_op.cc DEPS ${op_DEPS}) add_operator(shape_op_lite extra SRCS shape_op.cc DEPS ${op_DEPS}) add_operator(sequence_expand_op_lite extra SRCS sequence_expand_op.cc DEPS ${op_DEPS}) add_operator(sequence_unpad_op_lite extra SRCS sequence_unpad_op.cc DEPS ${op_DEPS}) +add_operator(sequence_pad_op_lite extra SRCS sequence_pad_op.cc DEPS ${op_DEPS}) +add_operator(sequence_mask_op_lite extra SRCS sequence_mask_op.cc DEPS ${op_DEPS}) add_operator(im2sequence_op extra SRCS im2sequence_op.cc DEPS ${op_DEPS}) add_operator(gather_op extra SRCS gather_op.cc DEPS ${op_DEPS}) add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEPS}) @@ -110,6 +113,8 @@ add_operator(distribute_fpn_proposals_op_lite extra SRCS distribute_fpn_proposal add_operator(crf_decoding_op_lite extra SRCS crf_decoding_op.cc DEPS ${op_DEPS}) add_operator(ctc_align_op_lite extra SRCS ctc_align_op.cc DEPS ${op_DEPS}) add_operator(max_pool_with_index_op extra SRCS max_pool_with_index_op.cc DEPS ${op_DEPS}) +add_operator(pixel_shuffle_op extra SRCS pixel_shuffle_op.cc DEPS ${op_DEPS}) +add_operator(clip_op extra SRCS clip_op.cc DEPS ${op_DEPS}) # for OCR specific add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS}) @@ -137,12 +142,15 @@ add_operator(topk_op extra SRCS topk_op.cc DEPS ${op_DEPS}) add_operator(increment_op extra SRCS increment_op.cc DEPS ${op_DEPS}) add_operator(layer_norm_op extra SRCS layer_norm_op.cc DEPS ${op_DEPS}) add_operator(sequence_softmax_op extra SRCS sequence_softmax_op.cc DEPS ${op_DEPS}) +add_operator(retinanet_detection_output_op extra SRCS retinanet_detection_output_op.cc DEPS ${op_DEPS}) +add_operator(where_index_op extra SRCS where_index_op.cc DEPS ${op_DEPS}) # for content-dnn specific add_operator(search_aligned_mat_mul_op extra SRCS search_aligned_mat_mul_op.cc DEPS ${op_DEPS}) add_operator(search_seq_fc_op extra SRCS search_seq_fc_op.cc DEPS ${op_DEPS}) add_operator(sequence_topk_avg_pooling_op basic SRCS sequence_topk_avg_pooling_op.cc DEPS ${op_DEPS}) add_operator(search_fc_op basic SRCS search_fc_op.cc DEPS ${op_DEPS}) add_operator(lstm_op extra SRCS lstm_op.cc DEPS ${op_DEPS}) +add_operator(topk_pooling_op extra SRCS topk_pooling_op.cc DEPS ${op_DEPS}) # for deformable-convNet add_operator(deformable_conv_op extra SRCS 
deformable_conv_op.cc DEPS ${op_DEPS}) @@ -160,6 +168,9 @@ add_operator(__xpu__resnet50_op extra SRCS __xpu__resnet50_op.cc DEPS ${op_DEPS} add_operator(__xpu__multi_encoder_op extra SRCS __xpu__multi_encoder_op.cc DEPS ${op_DEPS}) add_operator(__xpu__embedding_with_eltwise_add_op extra SRCS __xpu__embedding_with_eltwise_add_op.cc DEPS ${op_DEPS}) add_operator(__xpu__fc_op extra SRCS __xpu__fc_op.cc DEPS ${op_DEPS}) +add_operator(__xpu__resnet_cbam_op extra SRCS __xpu__resnet_cbam_op.cc DEPS ${op_DEPS}) +add_operator(__xpu__search_attention_op extra SRCS __xpu__search_attention_op.cc DEPS ${op_DEPS}) +add_operator(__xpu__mmdnn_op extra SRCS __xpu__mmdnn_op.cc DEPS ${op_DEPS}) if (NOT LITE_WITH_X86) lite_cc_test(test_fc_op SRCS fc_op_test.cc diff --git a/lite/operators/__xpu__mmdnn_op.cc b/lite/operators/__xpu__mmdnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..35024da911ba0659c5005a1adc641fa3adc2f282 --- /dev/null +++ b/lite/operators/__xpu__mmdnn_op.cc @@ -0,0 +1,239 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/__xpu__mmdnn_op.h" +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool XPUMmdnnBidEmbGrnnAttOp::CheckShape() const { return true; } + +bool XPUMmdnnBidEmbGrnnAttOp::InferShapeImpl() const { + auto& id_dims = param_.id0->dims(); + auto& id_lod = param_.id0->lod()[0]; + auto& emb_tbl_dims = param_.emb_tbl->dims(); + auto& grnn_wh_dims = param_.grnn_rv_wh->dims(); + + param_.grnn_fw_pool_out->Resize( + {(int64_t)id_lod.size() - 1, grnn_wh_dims[2]}); + param_.grnn_rv_pool_out->Resize( + {(int64_t)id_lod.size() - 1, grnn_wh_dims[2]}); + param_.att_pool_out->Resize( + {(int64_t)id_lod.size() - 1, 2 * grnn_wh_dims[2]}); + param_.concat_3in1_out->Resize({id_dims[0], 3 * grnn_wh_dims[2]}); + param_.concat_3in1_out->set_lod({id_lod}); + param_.emb_fw_out->Resize({id_dims[0], emb_tbl_dims[1]}); + param_.emb_fw_out->set_lod({id_lod}); + return true; +} + +bool XPUMmdnnBidEmbGrnnAttOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.id0 = + scope->FindVar(op_desc.Input("id0").front())->GetMutable(); + param_.id1 = + scope->FindVar(op_desc.Input("id1").front())->GetMutable(); + param_.emb_tbl = scope->FindVar(op_desc.Input("emb_tbl").front()) + ->GetMutable(); + param_.grnn_fw_wh = scope->FindVar(op_desc.Input("grnn_fw_wh").front()) + ->GetMutable(); + param_.grnn_fw_wi = scope->FindVar(op_desc.Input("grnn_fw_wi").front()) + ->GetMutable(); + param_.grnn_rv_wh = scope->FindVar(op_desc.Input("grnn_rv_wh").front()) + ->GetMutable(); + param_.grnn_rv_wi = scope->FindVar(op_desc.Input("grnn_rv_wi").front()) + ->GetMutable(); + param_.att_fc_w = scope->FindVar(op_desc.Input("att_fc_w").front()) + ->GetMutable(); + param_.att_fc_b = scope->FindVar(op_desc.Input("att_fc_b").front()) + ->GetMutable(); + + param_.grnn_fw_pool_out = + 
scope->FindVar(op_desc.Output("grnn_fw_pool_out").front()) + ->GetMutable(); + param_.grnn_rv_pool_out = + scope->FindVar(op_desc.Output("grnn_rv_pool_out").front()) + ->GetMutable(); + param_.att_pool_out = scope->FindVar(op_desc.Output("att_pool_out").front()) + ->GetMutable(); + param_.concat_3in1_out = + scope->FindVar(op_desc.Output("concat_3in1_out").front()) + ->GetMutable(); + param_.emb_fw_out = scope->FindVar(op_desc.Output("emb_fw_out").front()) + ->GetMutable(); + + param_.grnn_fw_wh_maxs = + op_desc.GetAttr>("grnn_fw_wh_maxs"); + param_.grnn_fw_wi_maxs = + op_desc.GetAttr>("grnn_fw_wi_maxs"); + param_.grnn_rv_wh_maxs = + op_desc.GetAttr>("grnn_rv_wh_maxs"); + param_.grnn_rv_wi_maxs = + op_desc.GetAttr>("grnn_rv_wi_maxs"); + param_.att_fc_w_max = op_desc.GetAttr("att_fc_w_max"); + return true; +} + +bool XPUMmdnnBidEmbAttOp::CheckShape() const { return true; } + +bool XPUMmdnnBidEmbAttOp::InferShapeImpl() const { + auto& id_dims = param_.id0->dims(); + auto& id_lod = param_.id0->lod()[0]; + auto& emb_tbl_dims = param_.emb_tbl->dims(); + + param_.att_pool_out->Resize({(int64_t)id_lod.size() - 1, emb_tbl_dims[1]}); + param_.emb_fw_out->Resize({id_dims[0], emb_tbl_dims[1]}); + param_.emb_fw_out->set_lod({id_lod}); + return true; +} + +bool XPUMmdnnBidEmbAttOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.id0 = + scope->FindVar(op_desc.Input("id0").front())->GetMutable(); + param_.id1 = + scope->FindVar(op_desc.Input("id1").front())->GetMutable(); + param_.emb_tbl = scope->FindVar(op_desc.Input("emb_tbl").front()) + ->GetMutable(); + param_.att_fc_w = scope->FindVar(op_desc.Input("att_fc_w").front()) + ->GetMutable(); + param_.att_fc_b = scope->FindVar(op_desc.Input("att_fc_b").front()) + ->GetMutable(); + + param_.att_pool_out = scope->FindVar(op_desc.Output("att_pool_out").front()) + ->GetMutable(); + param_.emb_fw_out = scope->FindVar(op_desc.Output("emb_fw_out").front()) + ->GetMutable(); + + param_.att_fc_w_max = op_desc.GetAttr("att_fc_w_max"); + return true; +} + +bool XPUMmdnnMatchConvTopkOp::CheckShape() const { return true; } + +bool XPUMmdnnMatchConvTopkOp::InferShapeImpl() const { + int channel_num = param_.channel_num; + std::vector topks = param_.topks; + auto row_dim = param_.input_x->dims(); + auto num_k = topks.size(); + auto row_shape_0 = row_dim[0]; + std::vector vec_out_shape; + vec_out_shape.push_back(row_shape_0); + vec_out_shape.push_back(channel_num * num_k); + + param_.topk_out->Resize(lite::DDim(vec_out_shape)); + param_.topk_out->set_lod(param_.input_x->lod()); + return true; +} + +bool XPUMmdnnMatchConvTopkOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.input_x = scope->FindVar(op_desc.Input("input_x").front()) + ->GetMutable(); + param_.input_y = scope->FindVar(op_desc.Input("input_y").front()) + ->GetMutable(); + param_.input_w = scope->FindVar(op_desc.Input("input_w").front()) + ->GetMutable(); + param_.conv_w = scope->FindVar(op_desc.Input("conv_w").front()) + ->GetMutable(); + + param_.topk_out = scope->FindVar(op_desc.Output("topk_out").front()) + ->GetMutable(); + + param_.input_w_max = op_desc.GetAttr("input_w_max"); + param_.conv_w_max = op_desc.GetAttr("conv_w_max"); + param_.topks = op_desc.GetAttr>("topks"); + param_.channel_num = op_desc.GetAttr("channel_num"); + param_.dim_t = op_desc.GetAttr("dim_t"); + return true; +} + +bool XPUMmdnnMergeAllOp::CheckShape() const { return true; } + +bool XPUMmdnnMergeAllOp::InferShapeImpl() const { + int64_t dim0 = param_.concat_7in1_x[0]->dims()[0]; 
+ int64_t dim1 = param_.fc2_w->dims()[0]; + std::vector vec_out_shape; + vec_out_shape.push_back(dim0); + vec_out_shape.push_back(dim1); + + param_.out->Resize(lite::DDim(vec_out_shape)); + return true; +} + +bool XPUMmdnnMergeAllOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.concat_7in1_x.clear(); + for (auto& name : op_desc.Input("concat_7in1_x")) { + auto t = scope->FindVar(name)->GetMutable(); + param_.concat_7in1_x.push_back(t); + } + param_.concat_2in1_x.clear(); + for (auto& name : op_desc.Input("concat_2in1_x")) { + auto t = scope->FindVar(name)->GetMutable(); + param_.concat_2in1_x.push_back(t); + } + param_.grnn_fw_wh = scope->FindVar(op_desc.Input("grnn_fw_wh").front()) + ->GetMutable(); + param_.grnn_fw_wi = scope->FindVar(op_desc.Input("grnn_fw_wi").front()) + ->GetMutable(); + param_.grnn_rv_wh = scope->FindVar(op_desc.Input("grnn_rv_wh").front()) + ->GetMutable(); + param_.grnn_rv_wi = scope->FindVar(op_desc.Input("grnn_rv_wi").front()) + ->GetMutable(); + param_.fc0_w = scope->FindVar(op_desc.Input("fc0_w").front()) + ->GetMutable(); + param_.fc0_b = scope->FindVar(op_desc.Input("fc0_b").front()) + ->GetMutable(); + param_.fc1_w = scope->FindVar(op_desc.Input("fc1_w").front()) + ->GetMutable(); + param_.fc1_b = scope->FindVar(op_desc.Input("fc1_b").front()) + ->GetMutable(); + param_.fc2_w = scope->FindVar(op_desc.Input("fc2_w").front()) + ->GetMutable(); + param_.fc2_b = scope->FindVar(op_desc.Input("fc2_b").front()) + ->GetMutable(); + + param_.out = + scope->FindVar(op_desc.Output("out").front())->GetMutable(); + + param_.grnn_fw_wh_maxs = + op_desc.GetAttr>("grnn_fw_wh_maxs"); + param_.grnn_fw_wi_maxs = + op_desc.GetAttr>("grnn_fw_wi_maxs"); + param_.grnn_rv_wh_maxs = + op_desc.GetAttr>("grnn_rv_wh_maxs"); + param_.grnn_rv_wi_maxs = + op_desc.GetAttr>("grnn_rv_wi_maxs"); + param_.fc0_w_max = op_desc.GetAttr("fc0_w_max"); + param_.fc1_w_max = op_desc.GetAttr("fc1_w_max"); + param_.fc2_w_max = op_desc.GetAttr("fc2_w_max"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(__xpu__mmdnn_bid_emb_grnn_att, + paddle::lite::operators::XPUMmdnnBidEmbGrnnAttOp); +REGISTER_LITE_OP(__xpu__mmdnn_bid_emb_att, + paddle::lite::operators::XPUMmdnnBidEmbAttOp); +REGISTER_LITE_OP(__xpu__mmdnn_match_conv_topk, + paddle::lite::operators::XPUMmdnnMatchConvTopkOp); +REGISTER_LITE_OP(__xpu__mmdnn_merge_all, + paddle::lite::operators::XPUMmdnnMergeAllOp); diff --git a/lite/operators/__xpu__mmdnn_op.h b/lite/operators/__xpu__mmdnn_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7038898cad0823746f905e4e60c06885b57a737c --- /dev/null +++ b/lite/operators/__xpu__mmdnn_op.h @@ -0,0 +1,107 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
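For readers following the InferShapeImpl implementations above, the output shapes of these fused MMDNN ops are driven by the input's LoD: a batch contains lod[0].size() - 1 sequences, pooled outputs get one row per sequence, match_conv_topk emits channel_num values per requested k per input row, and merge_all takes its second dimension from the first dimension of the fc2 weight. A minimal standalone sketch of that arithmetic (plain C++, no Lite types; names here are illustrative only):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// One row per sequence: lod0 = {0, end_of_seq_0, end_of_seq_1, ...}
std::vector<int64_t> PooledOutShape(const std::vector<uint64_t>& lod0, int64_t emb_dim) {
  return {static_cast<int64_t>(lod0.size()) - 1, emb_dim};
}

// match_conv_topk: channel_num values per requested k, per input row.
std::vector<int64_t> TopkOutShape(int64_t rows, int channel_num, size_t num_k) {
  return {rows, static_cast<int64_t>(channel_num) * static_cast<int64_t>(num_k)};
}

// merge_all: second dimension comes from fc2_w->dims()[0] in the op above.
std::vector<int64_t> MergeAllOutShape(int64_t batch, int64_t fc2_w_dim0) {
  return {batch, fc2_w_dim0};
}

int main() {
  std::vector<uint64_t> lod0 = {0, 3, 7};  // two sequences of length 3 and 4
  auto pooled = PooledOutShape(lod0, 128);
  auto topk = TopkOutShape(/*rows=*/7, /*channel_num=*/5, /*num_k=*/3);
  auto merged = MergeAllOutShape(/*batch=*/2, /*fc2_w_dim0=*/1);
  std::cout << pooled[0] << "x" << pooled[1] << ", "   // 2x128
            << topk[0] << "x" << topk[1] << ", "       // 7x15
            << merged[0] << "x" << merged[1] << "\n";  // 2x1
  return 0;
}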
+ +#pragma once +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class XPUMmdnnBidEmbGrnnAttOp : public OpLite { + public: + XPUMmdnnBidEmbGrnnAttOp() {} + + explicit XPUMmdnnBidEmbGrnnAttOp(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "XPUMmdnnBidEmbGrnnAttOp"; } + + private: + mutable XPUMmdnnBidEmbGrnnAttParam param_; +}; + +class XPUMmdnnBidEmbAttOp : public OpLite { + public: + XPUMmdnnBidEmbAttOp() {} + + explicit XPUMmdnnBidEmbAttOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "XPUMmdnnBidEmbAttOp"; } + + private: + mutable XPUMmdnnBidEmbAttParam param_; +}; + +class XPUMmdnnMatchConvTopkOp : public OpLite { + public: + XPUMmdnnMatchConvTopkOp() {} + + explicit XPUMmdnnMatchConvTopkOp(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "XPUMmdnnMatchConvTopkOp"; } + + private: + mutable XPUMmdnnMatchConvTopkParam param_; +}; + +class XPUMmdnnMergeAllOp : public OpLite { + public: + XPUMmdnnMergeAllOp() {} + + explicit XPUMmdnnMergeAllOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "XPUMmdnnMergeAllOp"; } + + private: + mutable XPUMmdnnMergeAllParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/__xpu__resnet_cbam_op.cc b/lite/operators/__xpu__resnet_cbam_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..6013f4fa90033c51df7a0d3bb670e02f8bf4628d --- /dev/null +++ b/lite/operators/__xpu__resnet_cbam_op.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/__xpu__resnet_cbam_op.h" +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool XPUResNetCbamOp::CheckShape() const { return true; } + +bool XPUResNetCbamOp::InferShapeImpl() const { + auto input_shape = param_.input->dims(); + std::vector output_shape_vec{1, 64}; + paddle::lite::DDim output_shape(output_shape_vec); + output_shape[0] = input_shape[0]; + param_.output->Resize(output_shape); + return true; +} + +bool XPUResNetCbamOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + param_.input = const_cast( + &scope->FindVar(op_desc.Input("Input").front())->Get()); + param_.output = scope->FindVar(op_desc.Output("Output").front()) + ->GetMutable(); + + param_.filter.clear(); + for (auto& name : op_desc.Input("Filter")) { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.filter.push_back(t); + } + param_.bias.clear(); + for (auto& name : op_desc.Input("Bias")) { + if (name.substr(0, 11) == "placeholder") { + param_.bias.push_back(nullptr); + } else { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.bias.push_back(t); + } + } + param_.max_filter.clear(); + for (auto& name : op_desc.Input("MaxFilter")) { + auto t = + const_cast(&scope->FindVar(name)->Get()); + param_.max_filter.push_back(t); + } + + param_.pool_p = op_desc.GetAttr("pool_p"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(__xpu__resnet_cbam, paddle::lite::operators::XPUResNetCbamOp); diff --git a/lite/operators/__xpu__resnet_cbam_op.h b/lite/operators/__xpu__resnet_cbam_op.h new file mode 100644 index 0000000000000000000000000000000000000000..26e5bafeae31183e9054e7e77ea46813c95db707 --- /dev/null +++ b/lite/operators/__xpu__resnet_cbam_op.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class XPUResNetCbamOp : public OpLite { + public: + XPUResNetCbamOp() {} + explicit XPUResNetCbamOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "ResNetCbam"; } + + private: + mutable XPUResNetCbamParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/__xpu__search_attention_op.cc b/lite/operators/__xpu__search_attention_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..acd8c817b0d81ef03df1c05417b8bb2f56c00812 --- /dev/null +++ b/lite/operators/__xpu__search_attention_op.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/__xpu__search_attention_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool XPUMmdnnSearchAttentionOp::CheckShape() const { return true; } + +bool XPUMmdnnSearchAttentionOp::InferShapeImpl() const { + auto& x_dims = param_.X->dims(); + param_.Out->Resize(x_dims); + param_.Out->set_lod(param_.X->lod()); + return true; +} + +bool XPUMmdnnSearchAttentionOp::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + auto x = op_desc.Input("X").front(); + auto w = op_desc.Input("W").front(); + auto b = op_desc.Input("b").front(); + auto out = op_desc.Output("Out").front(); + + param_.X = scope->FindVar(x)->GetMutable(); + param_.W = scope->FindVar(w)->GetMutable(); + param_.b = scope->FindVar(b)->GetMutable(); + param_.Out = scope->FindVar(out)->GetMutable(); + + param_.W_max = op_desc.GetAttr("W_max"); + param_.pad_id = op_desc.GetAttr("pad_id"); + param_.alpha0 = op_desc.GetAttr("alpha0"); + param_.alpha1 = op_desc.GetAttr("alpha1"); + param_.mask = op_desc.GetAttr("mask"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(__xpu__mmdnn_search_attention, + paddle::lite::operators::XPUMmdnnSearchAttentionOp); diff --git a/lite/operators/__xpu__search_attention_op.h b/lite/operators/__xpu__search_attention_op.h new file mode 100644 index 0000000000000000000000000000000000000000..81bd366ee8a51dc8d2d7fb4c9cb03d2199bcb4f2 --- /dev/null +++ b/lite/operators/__xpu__search_attention_op.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
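All of these fused XPU ops follow the same binding pattern in AttachImpl: each argument tensor is looked up by variable name in the scope via Scope::FindVar(...)->GetMutable<lite::Tensor>(), and each scalar attribute is read with the typed cpp::OpDesc::GetAttr<T>(...) accessor. The stand-in types below exist only to keep the sketch compilable outside the Lite class hierarchy; the real ops use the Lite APIs named above.

#include <map>
#include <string>

// Stand-ins for lite::Tensor, lite::Scope and cpp::OpDesc, only so the sketch compiles.
struct Tensor {};

struct Scope {
  std::map<std::string, Tensor> vars;
  Tensor* FindMutableTensor(const std::string& name) { return &vars[name]; }
};

struct OpDesc {
  std::map<std::string, std::string> args;  // argument slot -> variable name
  std::map<std::string, float> fattrs;
  std::map<std::string, int> iattrs;
};

// Mirrors the binding done in XPUMmdnnSearchAttentionOp::AttachImpl above.
struct SearchAttentionParam {
  Tensor *X{nullptr}, *W{nullptr}, *b{nullptr}, *Out{nullptr};
  float W_max{0.0f};
  int pad_id{0};
  float alpha0{1.0f}, alpha1{1.0f}, mask{1.0f};
};

void Attach(const OpDesc& desc, Scope* scope, SearchAttentionParam* p) {
  p->X = scope->FindMutableTensor(desc.args.at("X"));
  p->W = scope->FindMutableTensor(desc.args.at("W"));
  p->b = scope->FindMutableTensor(desc.args.at("b"));
  p->Out = scope->FindMutableTensor(desc.args.at("Out"));
  p->W_max = desc.fattrs.at("W_max");    // GetAttr<float>("W_max") in the real op
  p->pad_id = desc.iattrs.at("pad_id");  // GetAttr<int>("pad_id") in the real op
  p->alpha0 = desc.fattrs.at("alpha0");
  p->alpha1 = desc.fattrs.at("alpha1");
  p->mask = desc.fattrs.at("mask");
}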
+ +#pragma once + +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class XPUMmdnnSearchAttentionOp : public OpLite { + public: + XPUMmdnnSearchAttentionOp() {} + + explicit XPUMmdnnSearchAttentionOp(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { + return "XPUMmdnnSearchAttentionOp"; + } + + private: + mutable XPUMmdnnSearchAttentionParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/activation_grad_ops.cc b/lite/operators/activation_grad_ops.cc index b31163e5dce6d9b77d923ba44ed58952263610a5..a30231be921e2c4445bb4c7a72c9572b14c1c0f5 100644 --- a/lite/operators/activation_grad_ops.cc +++ b/lite/operators/activation_grad_ops.cc @@ -41,15 +41,11 @@ bool ActivationGradOp::AttachImpl(const cpp::OpDesc& opdesc, if (opdesc.HasInput("X")) { auto X_name = opdesc.Input("X").front(); param_.X = GetVar(scope, X_name); - } else { - param_.X = param_.X_grad; } if (opdesc.HasInput("Out")) { auto Out_name = opdesc.Input("Out").front(); param_.Out = GetVar(scope, Out_name); - } else { - param_.Out = param_.Out_grad; } return true; @@ -60,3 +56,5 @@ bool ActivationGradOp::AttachImpl(const cpp::OpDesc& opdesc, } // namespace paddle REGISTER_LITE_OP(square_grad, paddle::lite::operators::ActivationGradOp); +REGISTER_LITE_OP(relu_grad, paddle::lite::operators::ActivationGradOp); +REGISTER_LITE_OP(tanh_grad, paddle::lite::operators::ActivationGradOp); diff --git a/lite/operators/affine_grid_op.cc b/lite/operators/affine_grid_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..22c6b531ae2f4db4136c842720edf56e41900157 --- /dev/null +++ b/lite/operators/affine_grid_op.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/affine_grid_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool AffineGridOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Out); + + const auto x_dims = param_.X->dims(); + + CHECK_OR_FALSE(x_dims.size() == 3); + CHECK_OR_FALSE(x_dims[1] == 2 && x_dims[2] == 3); + + if (param_.output_shape.size() != 0) { + CHECK_OR_FALSE(param_.output_shape.size() == 4); + } + return true; +} + +bool AffineGridOpLite::InferShapeImpl() const { + int N = param_.X->dims()[0]; + int H, W; + if (param_.output_shape.size() == 0) { + const auto out_shape = param_.OutputShape->dims(); + H = out_shape[2]; + W = out_shape[3]; + + } else { + H = param_.output_shape[2]; + W = param_.output_shape[3]; + } + param_.Out->Resize(std::vector({N, H, W, 2})); + + return true; +} + +bool AffineGridOpLite::AttachImpl(const cpp::OpDesc &op_desc, + lite::Scope *scope) { + auto x = op_desc.Input("Theta").front(); + auto output = op_desc.Output("Output").front(); + + param_.X = scope->FindVar(x)->GetMutable(); + param_.output_shape = op_desc.GetAttr>("output_shape"); + + param_.Out = scope->FindVar(output)->GetMutable(); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(affine_grid, paddle::lite::operators::AffineGridOpLite); diff --git a/lite/operators/affine_grid_op.h b/lite/operators/affine_grid_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a94eb3d122b74b4e42d8714f284e478e6fb053f6 --- /dev/null +++ b/lite/operators/affine_grid_op.h @@ -0,0 +1,48 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class AffineGridOpLite : public OpLite { + public: + AffineGridOpLite() {} + + explicit AffineGridOpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "affine_grid"; } + + private: + mutable AffineGridParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/assign_value_op.cc b/lite/operators/assign_value_op.cc index ff5b55735f7b58aa2eaa2274574336dadd8061e6..f6f8cb7e3c8958693dd7234b7a21b29b769aa96c 100644 --- a/lite/operators/assign_value_op.cc +++ b/lite/operators/assign_value_op.cc @@ -26,12 +26,15 @@ bool AssignValueOpLite::CheckShape() const { auto shape = param_.shape; auto int32_values = param_.int32_values; auto fp32_values = param_.fp32_values; + auto int64_values = param_.int64_values; + auto bool_values = param_.bool_values; size_t shape_num = 1; - for (int i = 0; i < shape.size(); i++) { + for (size_t i = 0; i < shape.size(); i++) { shape_num *= shape[i]; } - CHECK_OR_FALSE(shape_num == int32_values.size() || - shape_num == fp32_values.size()); + CHECK_OR_FALSE( + shape_num == int32_values.size() || shape_num == fp32_values.size() || + shape_num == int64_values.size() || shape_num == bool_values.size()); return true; } @@ -47,9 +50,18 @@ bool AssignValueOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { param_.shape = op_desc.GetAttr>("shape"); param_.dtype = op_desc.GetAttr("dtype"); - param_.fp32_values = op_desc.GetAttr>("fp32_values"); - param_.int32_values = op_desc.GetAttr>("int32_values"); - + if (op_desc.HasAttr("fp32_values")) { + param_.fp32_values = op_desc.GetAttr>("fp32_values"); + } + if (op_desc.HasAttr("int32_values")) { + param_.int32_values = op_desc.GetAttr>("int32_values"); + } + if (op_desc.HasAttr("int64_values")) { + param_.int64_values = op_desc.GetAttr>("int64_values"); + } + if (op_desc.HasAttr("bool_values")) { + param_.bool_values = op_desc.GetAttr>("bool_values"); + } auto out = op_desc.Output("Out").front(); param_.Out = scope->FindVar(out)->GetMutable(); return true; diff --git a/lite/operators/clip_op.cc b/lite/operators/clip_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..ad8eef45f3b38cd176d1bd3d2d0b42620faf602c --- /dev/null +++ b/lite/operators/clip_op.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
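With the assign_value_op.cc change above, the op accepts whichever value list the model provides (fp32, int32, int64 or bool); CheckShape only requires that the element count implied by the shape attribute matches the length of one of the populated lists. A toy standalone sketch of that consistency check (dtype dispatch is omitted):

#include <cassert>
#include <cstddef>
#include <vector>

// True when the shape's element count matches one of the value lists,
// mirroring AssignValueOpLite::CheckShape after the int64/bool additions.
bool ShapesConsistent(const std::vector<int>& shape,
                      size_t n_fp32, size_t n_int32, size_t n_int64, size_t n_bool) {
  size_t shape_num = 1;
  for (int d : shape) shape_num *= static_cast<size_t>(d);
  return shape_num == n_fp32 || shape_num == n_int32 ||
         shape_num == n_int64 || shape_num == n_bool;
}

int main() {
  assert(ShapesConsistent({2, 3}, 0, 6, 0, 0));   // six int32 values: ok
  assert(!ShapesConsistent({2, 3}, 0, 0, 4, 0));  // four int64 values: mismatch
  return 0;
}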
+ +#include "lite/operators/clip_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool ClipOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.out); + return true; +} + +bool ClipOpLite::InferShapeImpl() const { + param_.out->Resize(param_.x->dims()); + param_.out->set_lod(param_.x->lod()); + return true; +} + +bool ClipOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { + AttachInput(op_desc, scope, "X", false, ¶m_.x); + AttachInput(op_desc, scope, "Min", true, ¶m_.min_tensor); + AttachInput(op_desc, scope, "Max", true, ¶m_.max_tensor); + AttachOutput(op_desc, scope, "Out", false, ¶m_.out); + + param_.min = op_desc.GetAttr("min"); + param_.max = op_desc.GetAttr("max"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(clip, paddle::lite::operators::ClipOpLite); diff --git a/lite/operators/clip_op.h b/lite/operators/clip_op.h new file mode 100644 index 0000000000000000000000000000000000000000..25c7f9a824ffc4b395a13df39811074724211f44 --- /dev/null +++ b/lite/operators/clip_op.h @@ -0,0 +1,48 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class ClipOpLite : public OpLite { + public: + ClipOpLite() {} + + explicit ClipOpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "clip"; } + + private: + mutable ClipParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/conv_op.h b/lite/operators/conv_op.h index c3e375e2e44b8184e6e7e635ab2c6c1f8889f844..a1d4e2e8a038046b257b3ab5f936cc4cb2e62c67 100644 --- a/lite/operators/conv_op.h +++ b/lite/operators/conv_op.h @@ -74,7 +74,7 @@ class ConvOpLite : public OpLite { param_.output = scope->FindVar(Out)->GetMutable(); param_.strides = op_desc.GetAttr>("strides"); - auto paddings = op_desc.GetAttr>("paddings"); + std::vector paddings = op_desc.GetAttr>("paddings"); param_.groups = op_desc.GetAttr("groups"); auto dilations = op_desc.GetAttr>("dilations"); param_.dilations = std::make_shared>(dilations); @@ -130,15 +130,18 @@ class ConvOpLite : public OpLite { padding_algorithm_ = op_desc.GetAttr("padding_algorithm"); } // For Int8 - if (op_desc.HasAttr("enable_int8")) { - param_.enable_int8 = op_desc.GetAttr("enable_int8"); - if (op_desc.HasAttr("input_scale")) - param_.input_scale = op_desc.GetAttr("input_scale"); - if (op_desc.HasAttr("weight_scale")) - param_.weight_scale = - op_desc.GetAttr>("weight_scale"); - if (op_desc.HasAttr("output_scale")) { - param_.output_scale = op_desc.GetAttr("output_scale"); + const OpInfo* op_info = dynamic_cast(&op_desc); + if (op_info != nullptr && op_info->HasAttr("enable_int8")) { + param_.enable_int8 = op_info->GetAttr("enable_int8"); + auto input_name = op_info->Input("Input").front(); + auto filter_name = op_info->Input("Filter").front(); + auto output_name = op_info->Output("Output").front(); + if (op_info->HasInputScale(input_name)) + param_.input_scale = op_info->GetInputScale(input_name)[0]; + if (op_info->HasInputScale(filter_name)) + param_.weight_scale = op_info->GetInputScale(filter_name); + if (op_info->HasOutputScale(output_name)) { + param_.output_scale = op_info->GetOutputScale(output_name)[0]; } } diff --git a/lite/operators/conv_transpose_op.cc b/lite/operators/conv_transpose_op.cc index 9d098eb975ef071a4650ea547d6081d950b251f1..732f8c5056f930259655339c8d8a0b2846f29313 100644 --- a/lite/operators/conv_transpose_op.cc +++ b/lite/operators/conv_transpose_op.cc @@ -106,7 +106,7 @@ bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc& op_desc, param_.output = scope->FindVar(Out)->GetMutable(); param_.strides = op_desc.GetAttr>("strides"); - auto paddings = op_desc.GetAttr>("paddings"); + std::vector paddings = op_desc.GetAttr>("paddings"); param_.groups = op_desc.GetAttr("groups"); auto dilations = op_desc.GetAttr>("dilations"); diff --git a/lite/operators/elementwise_ops.cc b/lite/operators/elementwise_ops.cc index 6cc41f0a66cfac4a0baa0153765a59766fa045f4..5895bb667aa22507d362004627304ecf78e085f1 100644 --- a/lite/operators/elementwise_ops.cc +++ b/lite/operators/elementwise_ops.cc @@ -144,6 +144,8 @@ REGISTER_LITE_OP(elementwise_add, 
paddle::lite::operators::ElementwiseOp); REGISTER_LITE_OP(elementwise_mul, paddle::lite::operators::ElementwiseOp); REGISTER_LITE_OP(elementwise_max, paddle::lite::operators::ElementwiseOp); REGISTER_LITE_OP(elementwise_div, paddle::lite::operators::ElementwiseOp); +REGISTER_LITE_OP(elementwise_mod, paddle::lite::operators::ElementwiseOp); +REGISTER_LITE_OP(elementwise_pow, paddle::lite::operators::ElementwiseOp); // #ifdef LITE_WITH_TRAIN // REGISTER_LITE_OP(elementwise_sub_grad, diff --git a/lite/operators/fc_op.cc b/lite/operators/fc_op.cc index d4032c5e8b98ff6d5763d2d06610d2e214ad90ca..28a220da2de0920643d46f1ed9c610dfa613cf95 100644 --- a/lite/operators/fc_op.cc +++ b/lite/operators/fc_op.cc @@ -102,14 +102,18 @@ bool FcOpLite::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { } // For Int8 - if (op_desc.HasAttr("enable_int8")) { - param_.enable_int8 = op_desc.GetAttr("enable_int8"); - if (op_desc.HasAttr("input_scale")) - param_.input_scale = op_desc.GetAttr("input_scale"); - if (op_desc.HasAttr("weight_scale")) - param_.weight_scale = op_desc.GetAttr>("weight_scale"); - if (op_desc.HasAttr("output_scale")) - param_.output_scale = op_desc.GetAttr("output_scale"); + const OpInfo* op_info = dynamic_cast(&op_desc); + if (op_info != nullptr && op_info->HasAttr("enable_int8")) { + param_.enable_int8 = op_info->GetAttr("enable_int8"); + auto input_name = op_info->Input("Input").front(); + auto weight_name = op_info->Input("W").front(); + auto out_name = op_info->Output("Out").front(); + if (op_info->HasInputScale(input_name)) + param_.input_scale = op_info->GetInputScale(input_name)[0]; + if (op_info->HasInputScale(weight_name)) + param_.weight_scale = op_info->GetInputScale(weight_name); + if (op_info->HasOutputScale(out_name)) + param_.output_scale = op_info->GetOutputScale(out_name)[0]; } return true; } diff --git a/lite/operators/match_matrix_tensor_op.cc b/lite/operators/match_matrix_tensor_op.cc index 1cc751109f76a96097d363b493322dde182a715d..fd70143131b458c1d985a21a6d9d84c707ba9986 100644 --- a/lite/operators/match_matrix_tensor_op.cc +++ b/lite/operators/match_matrix_tensor_op.cc @@ -94,6 +94,18 @@ bool MatchMatrixTensorOpLite::AttachImpl(const cpp::OpDesc& op_desc, param_.dim_t = op_desc.GetAttr("dim_t"); + if (op_desc.HasAttr("fuse_relu")) { + param_.fuse_relu = op_desc.GetAttr("fuse_relu"); + } +#ifdef LITE_WITH_XPU + if (op_desc.HasAttr("__xpu__float_to_fix")) { + param_.__xpu__float_to_fix = op_desc.GetAttr("__xpu__float_to_fix"); + } + if (op_desc.HasAttr("__xpu__w_max")) { + param_.__xpu__w_max = op_desc.GetAttr("__xpu__w_max"); + } +#endif + return true; } diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h index 8b463956438c61a95af9ec6ae7e7a3230672a237..f351e8e5344424d80fa79f8d7c83be3bf367441f 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -21,10 +21,9 @@ #include "lite/core/scope.h" #include "lite/core/tensor.h" #include "lite/core/types.h" -#include "lite/model_parser/cpp/block_desc.h" -#include "lite/model_parser/desc_apis.h" +#include "lite/model_parser/base/apis.h" +#include "lite/model_parser/cpp_desc.h" #include "lite/utils/all.h" -#include "lite/utils/variant.h" /* * This file contains all the argument parameter data structure for operators. 
*/ @@ -1032,12 +1031,28 @@ struct SequenceExpandParam : ParamBase { int ref_level{-1}; }; +struct SequencePadParam : ParamBase { + const lite::Tensor* X{}; + const lite::Tensor* PadValue{}; + lite::Tensor* Out{}; + lite::Tensor* Length{}; + int padded_length{-1}; +}; + struct SequenceUnpadParam : ParamBase { const lite::Tensor* X{}; const lite::Tensor* Length{}; lite::Tensor* Out{}; }; +struct SequenceMaskParam : ParamBase { + const lite::Tensor* X{}; + const lite::Tensor* MaxLenTensor{nullptr}; + lite::Tensor* Y{}; + int maxlen{-1}; + int out_dtype; +}; + struct SequenceExpandAsParam : ParamBase { const lite::Tensor* x{nullptr}; const lite::Tensor* y{nullptr}; @@ -1114,6 +1129,11 @@ struct VarConv2DParam : ParamBase { int kernel_w; bool fuse_relu{false}; + +#ifdef LITE_WITH_XPU + bool __xpu__float_to_fix{false}; // Is W already converted to int16/int8 + float __xpu__w_max{0.0f}; // Abs max in W +#endif }; /// ----------------------- shape operators ---------------------- @@ -1166,6 +1186,13 @@ struct AffineChannelParam : ParamBase { lite::Tensor* Out{}; }; +struct AffineGridParam : ParamBase { + const lite::Tensor* X{}; // Theta:shape {?, 2, 3} + std::vector output_shape; + const lite::Tensor* OutputShape; + lite::Tensor* Out{}; +}; + struct AnchorGeneratorParam : ParamBase { const lite::Tensor* Input{}; std::vector anchor_sizes{}; @@ -1324,6 +1351,8 @@ struct AssignValueParam : ParamBase { int dtype{}; std::vector fp32_values{}; std::vector int32_values{}; + std::vector int64_values{}; + std::vector bool_values{}; lite::Tensor* Out{}; }; @@ -1338,6 +1367,15 @@ struct SequenceTopkAvgPoolingParam : ParamBase { std::vector topks{}; }; +/// --------------- topk_pooling operators ------------------ +struct TopkPoolingParam : ParamBase { + const lite::Tensor* X{}; + const lite::Tensor* Y{}; + lite::Tensor* Out{}; + int top_k{1}; + int feat_map_num{1}; +}; + /// --------------- search_fc operators ------------------ struct SearchFcParam : ParamBase { const lite::Tensor* X{}; @@ -1345,6 +1383,13 @@ struct SearchFcParam : ParamBase { const lite::Tensor* b{}; lite::Tensor* Out{}; int out_size{}; + + bool fuse_relu{false}; + +#ifdef LITE_WITH_XPU + bool __xpu__float_to_fix{false}; // Is W already converted to int16/int8 + float __xpu__w_max{0.0f}; // Abs max in W +#endif }; /// --------------------- match_matrix_tensor operators -------------------- struct MatchMatrixTensorParam : ParamBase { @@ -1355,6 +1400,12 @@ struct MatchMatrixTensorParam : ParamBase { lite::Tensor* tmp{}; int dim_t; + bool fuse_relu{false}; + +#ifdef LITE_WITH_XPU + bool __xpu__float_to_fix{false}; // Is w already converted to int16/int8 + float __xpu__w_max{0.0f}; // Abs max in w +#endif }; /// --------------------- search_seq_depadding operators -------------------- @@ -1376,6 +1427,12 @@ struct SearchGrnnParam : ParamBase { lite::Tensor* tmp_buffer{}; lite::Tensor* idx_sorted_by_width{}; lite::Tensor* layout_input{}; + +#ifdef LITE_WITH_XPU + bool __xpu__float_to_fix{false}; // Is wi/wh already converted to int16/int8 + std::vector __xpu__wi_max; // Abs max in wi + std::vector __xpu__wh_max; // Abs max in wh +#endif }; struct SplitLodTensorParam : ParamBase { @@ -1530,6 +1587,106 @@ struct XPUFcParam : ParamBase { std::string activation_type{""}; }; +struct XPUResNetCbamParam : ParamBase { + lite::Tensor* input{}; + std::vector filter; + std::vector bias; + std::vector max_filter; + lite::Tensor* output{}; + + float pool_p{1.0f}; +}; + +struct XPUMmdnnSearchAttentionParam : ParamBase { + lite::Tensor* X{}; + lite::Tensor* 
W{}; + lite::Tensor* b{}; + lite::Tensor* Out{}; + + float W_max{0.0f}; + int pad_id{0}; + float alpha0{1.0f}; + float alpha1{1.0f}; + float mask{1.0f}; +}; + +struct XPUMmdnnBidEmbGrnnAttParam : ParamBase { + lite::Tensor* id0{}; + lite::Tensor* id1{}; + lite::Tensor* emb_tbl{}; + lite::Tensor* grnn_fw_wh{}; + lite::Tensor* grnn_fw_wi{}; + lite::Tensor* grnn_rv_wh{}; + lite::Tensor* grnn_rv_wi{}; + lite::Tensor* att_fc_w{}; + lite::Tensor* att_fc_b{}; + + std::vector grnn_fw_wh_maxs; + std::vector grnn_fw_wi_maxs; + std::vector grnn_rv_wh_maxs; + std::vector grnn_rv_wi_maxs; + float att_fc_w_max{0.0f}; + + lite::Tensor* grnn_fw_pool_out{}; // 1 + lite::Tensor* grnn_rv_pool_out{}; // 2 + lite::Tensor* att_pool_out{}; // 3 + lite::Tensor* concat_3in1_out{}; // 4 + lite::Tensor* emb_fw_out{}; // 5 +}; + +struct XPUMmdnnBidEmbAttParam : ParamBase { + lite::Tensor* id0{}; + lite::Tensor* id1{}; + lite::Tensor* emb_tbl{}; + lite::Tensor* att_fc_w{}; + lite::Tensor* att_fc_b{}; + + float att_fc_w_max{0.0f}; + + lite::Tensor* att_pool_out{}; // 1 + lite::Tensor* emb_fw_out{}; // 2 +}; + +struct XPUMmdnnMatchConvTopkParam : ParamBase { + lite::Tensor* input_x{}; + lite::Tensor* input_y{}; + lite::Tensor* input_w{}; + lite::Tensor* conv_w{}; + + float input_w_max{0.0f}; + float conv_w_max{0.0f}; + std::vector topks; + int channel_num{0}; + int dim_t{0}; + + lite::Tensor* topk_out{}; +}; + +struct XPUMmdnnMergeAllParam : ParamBase { + std::vector concat_7in1_x; + std::vector concat_2in1_x; + lite::Tensor* grnn_fw_wh{}; + lite::Tensor* grnn_fw_wi{}; + lite::Tensor* grnn_rv_wh{}; + lite::Tensor* grnn_rv_wi{}; + lite::Tensor* fc0_w{}; + lite::Tensor* fc0_b{}; + lite::Tensor* fc1_w{}; + lite::Tensor* fc1_b{}; + lite::Tensor* fc2_w{}; + lite::Tensor* fc2_b{}; + + std::vector grnn_fw_wh_maxs; + std::vector grnn_fw_wi_maxs; + std::vector grnn_rv_wh_maxs; + std::vector grnn_rv_wi_maxs; + float fc0_w_max{0.0f}; + float fc1_w_max{0.0f}; + float fc2_w_max{0.0f}; + + lite::Tensor* out{}; +}; + // For DeformableConvolution op struct DeformableConvParam : ParamBase { lite::Tensor* x{}; @@ -1568,6 +1725,34 @@ struct PixelShuffleParam : ParamBase { lite::Tensor* output{nullptr}; int upscale_factor{1}; }; + +struct RetinanetDetectionOutputParam : ParamBase { + std::vector bboxes{}; + std::vector scores{}; + std::vector anchors{}; + Tensor* im_info{}; + Tensor* out{}; + float score_threshold{}; + int nms_top_k{}; + float nms_threshold{}; + float nms_eta{}; + int keep_top_k{}; +}; + +struct WhereIndexParam : ParamBase { + const lite::Tensor* input{nullptr}; + lite::Tensor* output{nullptr}; +}; + +struct ClipParam : ParamBase { + Tensor* x{}; + Tensor* min_tensor{}; + Tensor* max_tensor{}; + Tensor* out{}; + float min{}; + float max{}; +}; + } // namespace operators } // namespace lite } // namespace paddle diff --git a/lite/operators/pixel_shuffle_op.cc b/lite/operators/pixel_shuffle_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..40f564bdd6d2699bafe497bdfded21ea4f3956a3 --- /dev/null +++ b/lite/operators/pixel_shuffle_op.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/pixel_shuffle_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool PixelShuffleOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.output); + CHECK_OR_FALSE(param_.upscale_factor); + const auto x_dims = param_.x->dims(); + const auto upscale_factor = param_.upscale_factor; + CHECK_EQ_OR_FALSE(x_dims[1] % (upscale_factor * upscale_factor), 0); + return true; +} + +bool PixelShuffleOpLite::InferShapeImpl() const { + const auto x_dims = param_.x->dims(); + const auto upscale_factor = param_.upscale_factor; + auto output_dims = x_dims; + output_dims[0] = x_dims[0]; + output_dims[1] = x_dims[1] / (upscale_factor * upscale_factor); + output_dims[2] = x_dims[2] * upscale_factor; + output_dims[3] = x_dims[3] * upscale_factor; + param_.output->Resize(output_dims); + return true; +} + +bool PixelShuffleOpLite::AttachImpl(const cpp::OpDesc& opdesc, + lite::Scope* scope) { + auto input = opdesc.Input("X").front(); + auto out = opdesc.Output("Out").front(); + + param_.x = scope->FindVar(input)->GetMutable(); + param_.output = scope->FindVar(out)->GetMutable(); + + if (opdesc.HasAttr("upscale_factor")) { + param_.upscale_factor = opdesc.GetAttr("upscale_factor"); + } + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(pixel_shuffle, paddle::lite::operators::PixelShuffleOpLite); diff --git a/lite/operators/pixel_shuffle_op.h b/lite/operators/pixel_shuffle_op.h new file mode 100644 index 0000000000000000000000000000000000000000..63efd8df778c6d92bc448f795c19ff5bffba62c8 --- /dev/null +++ b/lite/operators/pixel_shuffle_op.h @@ -0,0 +1,45 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
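PixelShuffleOpLite::InferShapeImpl above encodes the standard pixel-shuffle relationship: an NCHW input of shape {N, C, H, W} becomes {N, C/(r*r), H*r, W*r} for upscale factor r. A standalone reference rearrangement on a flat NCHW buffer (the actual Lite kernel is not shown in this patch):

#include <cassert>
#include <vector>

// Reference pixel shuffle on a flat NCHW buffer.
// Input  shape: {N, C, H, W} with C divisible by r*r.
// Output shape: {N, C/(r*r), H*r, W*r}.
std::vector<float> PixelShuffle(const std::vector<float>& in,
                                int N, int C, int H, int W, int r) {
  assert(C % (r * r) == 0);
  const int Co = C / (r * r), Ho = H * r, Wo = W * r;
  std::vector<float> out(static_cast<size_t>(N) * Co * Ho * Wo);
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < Co; ++c)
      for (int h = 0; h < Ho; ++h)
        for (int w = 0; w < Wo; ++w) {
          // The (h % r, w % r) sub-pixel offset folds back into the source channel.
          int ci = c * r * r + (h % r) * r + (w % r);
          int hi = h / r, wi = w / r;
          out[((n * Co + c) * Ho + h) * Wo + w] =
              in[((n * C + ci) * H + hi) * W + wi];
        }
  return out;
}

int main() {
  // 1x4x1x1 -> 1x1x2x2: the four input channels become one 2x2 patch.
  auto out = PixelShuffle({1.f, 2.f, 3.f, 4.f}, 1, 4, 1, 1, 2);
  assert(out[0] == 1.f && out[1] == 2.f && out[2] == 3.f && out[3] == 4.f);
  return 0;
}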
+ +#pragma once + +#include +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class PixelShuffleOpLite : public OpLite { + public: + PixelShuffleOpLite() {} + explicit PixelShuffleOpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "pixel_shuffle"; } + + private: + mutable PixelShuffleParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/pool_op.h b/lite/operators/pool_op.h index 92f00a4272fddeb03abd04cba473a997cce37217..916ed1dd6f036c6c36954622abbbc1361de1b790 100644 --- a/lite/operators/pool_op.h +++ b/lite/operators/pool_op.h @@ -54,7 +54,7 @@ class PoolOpLite : public OpLite { param_.ksize = op_desc.GetAttr>("ksize"); param_.global_pooling = op_desc.GetAttr("global_pooling"); param_.strides = op_desc.GetAttr>("strides"); - auto paddings = op_desc.GetAttr>("paddings"); + std::vector paddings = op_desc.GetAttr>("paddings"); if (op_desc.HasAttr("exclusive")) { param_.exclusive = op_desc.GetAttr("exclusive"); diff --git a/lite/operators/retinanet_detection_output_op.cc b/lite/operators/retinanet_detection_output_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e27f2bfca0ab25b8f73d4c6a68d539a7c22389e0 --- /dev/null +++ b/lite/operators/retinanet_detection_output_op.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/retinanet_detection_output_op.h" +#include +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool RetinanetDetectionOutputOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.bboxes.size() > 0); + CHECK_OR_FALSE(param_.scores.size() > 0); + CHECK_OR_FALSE(param_.anchors.size() > 0); + CHECK_OR_FALSE(param_.bboxes.size() == param_.scores.size()); + CHECK_OR_FALSE(param_.bboxes.size() == param_.anchors.size()); + CHECK_OR_FALSE(param_.im_info); + CHECK_OR_FALSE(param_.out); + + DDim bbox_dims = param_.bboxes.front()->dims(); + DDim score_dims = param_.scores.front()->dims(); + DDim anchor_dims = param_.anchors.front()->dims(); + DDim im_info_dims = param_.im_info->dims(); + + CHECK_OR_FALSE(bbox_dims.size() == 3); + CHECK_OR_FALSE(score_dims.size() == 3); + CHECK_OR_FALSE(anchor_dims.size() == 2); + CHECK_OR_FALSE(bbox_dims[2] == 4); + CHECK_OR_FALSE(bbox_dims[1] == score_dims[1]); + CHECK_OR_FALSE(anchor_dims[0] == bbox_dims[1]); + CHECK_OR_FALSE(im_info_dims.size() == 2); + + return true; +} + +bool RetinanetDetectionOutputOpLite::InferShapeImpl() const { + DDim bbox_dims = param_.bboxes.front()->dims(); + param_.out->Resize({bbox_dims[1], bbox_dims[2] + 2}); + return true; +} + +bool RetinanetDetectionOutputOpLite::AttachImpl(const cpp::OpDesc &op_desc, + lite::Scope *scope) { + for (auto arg_name : op_desc.Input("BBoxes")) { + param_.bboxes.push_back( + scope->FindVar(arg_name)->GetMutable()); + } + for (auto arg_name : op_desc.Input("Scores")) { + param_.scores.push_back( + scope->FindVar(arg_name)->GetMutable()); + } + for (auto arg_name : op_desc.Input("Anchors")) { + param_.anchors.push_back( + scope->FindVar(arg_name)->GetMutable()); + } + AttachInput(op_desc, scope, "ImInfo", false, ¶m_.im_info); + AttachOutput(op_desc, scope, "Out", false, ¶m_.out); + + param_.score_threshold = op_desc.GetAttr("score_threshold"); + param_.nms_top_k = op_desc.GetAttr("nms_top_k"); + param_.nms_threshold = op_desc.GetAttr("nms_threshold"); + param_.nms_eta = op_desc.GetAttr("nms_eta"); + param_.keep_top_k = op_desc.GetAttr("keep_top_k"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(retinanet_detection_output, + paddle::lite::operators::RetinanetDetectionOutputOpLite); diff --git a/lite/operators/retinanet_detection_output_op.h b/lite/operators/retinanet_detection_output_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9969227e15941644249b46ba7372f9afc705672c --- /dev/null +++ b/lite/operators/retinanet_detection_output_op.h @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class RetinanetDetectionOutputOpLite : public OpLite { + public: + RetinanetDetectionOutputOpLite() {} + + explicit RetinanetDetectionOutputOpLite(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { + return "retinanet_detection_output"; + } + +#ifdef LITE_WITH_PROFILE + void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {} +#endif + + private: + mutable RetinanetDetectionOutputParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_fc_op.cc b/lite/operators/search_fc_op.cc index 71e62c2ae729b4e1516a219888b9af3f7d994428..8024c38f9cc4a6d3ba2d47d6c61e716dd57bb362 100644 --- a/lite/operators/search_fc_op.cc +++ b/lite/operators/search_fc_op.cc @@ -70,6 +70,18 @@ bool SearchFcOpLite::AttachImpl(const cpp::OpDesc &op_desc, param_.Out = scope->FindVar(Out)->GetMutable(); param_.out_size = op_desc.GetAttr("out_size"); + if (op_desc.HasAttr("fuse_relu")) { + param_.fuse_relu = op_desc.GetAttr("fuse_relu"); + } +#ifdef LITE_WITH_XPU + if (op_desc.HasAttr("__xpu__float_to_fix")) { + param_.__xpu__float_to_fix = op_desc.GetAttr("__xpu__float_to_fix"); + } + if (op_desc.HasAttr("__xpu__w_max")) { + param_.__xpu__w_max = op_desc.GetAttr("__xpu__w_max"); + } +#endif + return true; } diff --git a/lite/operators/search_grnn_op.cc b/lite/operators/search_grnn_op.cc index 1ced477c109d8cd93485f0193523887759939f17..6f743693bc782e636064ca398539433b497dc645 100644 --- a/lite/operators/search_grnn_op.cc +++ b/lite/operators/search_grnn_op.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "lite/operators/search_grnn_op.h" +#include #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" @@ -84,6 +85,18 @@ bool SearchGrnnOpLite::AttachImpl(const cpp::OpDesc& op_desc, param_.layout_input = scope->FindVar(layout_input)->GetMutable(); +#ifdef LITE_WITH_XPU + if (op_desc.HasAttr("__xpu__float_to_fix")) { + param_.__xpu__float_to_fix = op_desc.GetAttr("__xpu__float_to_fix"); + } + if (op_desc.HasAttr("__xpu__wi_max")) { + param_.__xpu__wi_max = op_desc.GetAttr>("__xpu__wi_max"); + } + if (op_desc.HasAttr("__xpu__wh_max")) { + param_.__xpu__wh_max = op_desc.GetAttr>("__xpu__wh_max"); + } +#endif + return true; } diff --git a/lite/operators/sequence_mask_op.cc b/lite/operators/sequence_mask_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..bac1dc8a26abe9a9ae2bbd77e03c2375b4814268 --- /dev/null +++ b/lite/operators/sequence_mask_op.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/sequence_mask_op.h" + +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceMaskOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + return true; +} + +bool SequenceMaskOp::InferShapeImpl() const { return true; } + +bool SequenceMaskOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { + param_.X = const_cast( + &scope->FindVar(opdesc.Input("X").front())->Get()); + if (opdesc.HasInput("MaxLenTensor") && + !opdesc.Input("MaxLenTensor").empty()) { + auto var = scope->FindVar(opdesc.Input("MaxLenTensor").front()); + if (var != nullptr) { + param_.MaxLenTensor = var->GetMutable(); + } + } + param_.Y = + scope->FindVar(opdesc.Output("Y").front())->GetMutable(); + param_.maxlen = opdesc.GetAttr("maxlen"); + param_.out_dtype = opdesc.GetAttr("out_dtype"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_mask, paddle::lite::operators::SequenceMaskOp); diff --git a/lite/operators/sequence_mask_op.h b/lite/operators/sequence_mask_op.h new file mode 100644 index 0000000000000000000000000000000000000000..97008b865b850f3837fcc49befc5735987fb2048 --- /dev/null +++ b/lite/operators/sequence_mask_op.h @@ -0,0 +1,45 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequenceMaskOp : public OpLite { + public: + SequenceMaskOp() {} + explicit SequenceMaskOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "sequence_mask"; } + + private: + mutable SequenceMaskParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_pad_op.cc b/lite/operators/sequence_pad_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..687c4a1989deaa5afea2356338630fa0ee846cb5 --- /dev/null +++ b/lite/operators/sequence_pad_op.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/sequence_pad_op.h" +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequencePadOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.PadValue); + CHECK_OR_FALSE(param_.Out); + CHECK_OR_FALSE(param_.Length); + + return true; +} + +bool SequencePadOp::InferShapeImpl() const { + auto x_dims = param_.X->dims(); + CHECK_GE(x_dims.size(), 2) << "The rank of SequencePad OP Input(x) can't be " + "less than 2. But the rank we received is " + << x_dims.size(); + auto time_step_dims = x_dims.Slice(1, x_dims.size()); + auto pad_value_dims = param_.PadValue->dims(); + CHECK_EQ((pad_value_dims == DDim({1})) || (pad_value_dims == time_step_dims), + true) + << "The SequencePad OP Input(PadValue) must be a scalar or a tensor " + "whiose shape equals to time steps in sequences"; + + auto x_lod = param_.X->lod(); + CHECK_EQ(x_lod.empty(), false) + << "The SequencePad OP Input(X) must hold lod info."; + const auto &x_lod_0 = x_lod[0]; + CHECK_GE(x_lod_0.size(), 2) + << "The size of SequencePadOp Input(X)'s lod info can't be less than 2. " + "But the size we received is " + << x_lod_0.size(); + CHECK_EQ(x_dims[0], static_cast(x_lod_0.back())) + << "The SequencePadOp Input(X)'s lod info mismatches the actual tensor " + "shape. The 1st dimension of Input(X)'s lod info is " + << x_dims[0] << ", the 1st dimension of actual tensor shape is " + << static_cast(x_lod_0.back()); + + int seq_num = x_lod_0.size() - 1; + int max_seq_len = 0; + for (int i = 0; i < seq_num; ++i) { + max_seq_len = + std::max(max_seq_len, static_cast(x_lod_0[i + 1] - x_lod_0[i])); + } + if (param_.padded_length == -1) { + param_.padded_length = max_seq_len; + } + CHECK_GE(param_.padded_length, max_seq_len) + << "The SequencePadOp Attr(padded_length) should be greater than or " + "equal to the length of the longest original sequence. 
But the " + "padded_length we received is " + << param_.padded_length + << ", the length of the longest original sequence is " << max_seq_len; + + int out_dim_0 = seq_num; + std::vector out_dims_vec{out_dim_0, param_.padded_length}; + std::vector len_dims_vec{out_dim_0}; + auto time_step_dims_vec = time_step_dims.Vectorize(); + out_dims_vec.insert( + out_dims_vec.end(), time_step_dims_vec.begin(), time_step_dims_vec.end()); + param_.Out->Resize(out_dims_vec); + param_.Length->Resize(len_dims_vec); + return true; +} + +bool SequencePadOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { + param_.X = const_cast( + &scope->FindVar(opdesc.Input("X").front())->Get()); + param_.PadValue = const_cast( + &scope->FindVar(opdesc.Input("PadValue").front())->Get()); + param_.Length = scope->FindVar(opdesc.Input("Length").front()) + ->GetMutable(); + param_.Out = + scope->FindVar(opdesc.Output("Out").front())->GetMutable(); + param_.padded_length = opdesc.GetAttr("padded_length"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_pad, paddle::lite::operators::SequencePadOp); diff --git a/lite/operators/sequence_pad_op.h b/lite/operators/sequence_pad_op.h new file mode 100644 index 0000000000000000000000000000000000000000..bd5d732a5d8816d4f7994ee0e3175ac8a032b2d4 --- /dev/null +++ b/lite/operators/sequence_pad_op.h @@ -0,0 +1,45 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
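As a concrete example of the sequence_pad shape inference above: with X of dims {5, 3}, lod {0, 2, 5} (two sequences of length 2 and 3) and padded_length -1, the padded length resolves to 3, Out becomes {2, 3, 3}, and Length becomes {2}. A condensed standalone sketch of that arithmetic (validation omitted):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Mirrors SequencePadOp::InferShapeImpl shape arithmetic.
void SequencePadShapes(const std::vector<uint64_t>& lod0,
                       const std::vector<int64_t>& time_step_dims,
                       int padded_length,
                       std::vector<int64_t>* out_dims,
                       std::vector<int64_t>* length_dims) {
  int seq_num = static_cast<int>(lod0.size()) - 1;
  int max_seq_len = 0;
  for (int i = 0; i < seq_num; ++i)
    max_seq_len = std::max(max_seq_len, static_cast<int>(lod0[i + 1] - lod0[i]));
  if (padded_length == -1) padded_length = max_seq_len;
  *out_dims = {seq_num, padded_length};
  out_dims->insert(out_dims->end(), time_step_dims.begin(), time_step_dims.end());
  *length_dims = {seq_num};
}

int main() {
  std::vector<int64_t> out, len;
  SequencePadShapes({0, 2, 5}, {3}, -1, &out, &len);
  assert(out == (std::vector<int64_t>{2, 3, 3}));
  assert(len == (std::vector<int64_t>{2}));
  return 0;
}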
+ +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequencePadOp : public OpLite { + public: + SequencePadOp() {} + explicit SequencePadOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShapeImpl() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "sequence_pad"; } + + private: + mutable SequencePadParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_reverse_op.cc b/lite/operators/sequence_reverse_op.cc index 19a47cac9da666269fc5ef2a172ff0295b71e95d..fa2b0553aa2ac84f27d5d27d31df5ce9584d82c3 100644 --- a/lite/operators/sequence_reverse_op.cc +++ b/lite/operators/sequence_reverse_op.cc @@ -34,6 +34,7 @@ bool SequenceReverseOp::InferShapeImpl() const { const auto *input = param_.X; auto out_dims = input->dims(); param_.Out->Resize(out_dims); + param_.Out->set_lod(param_.X->lod()); return true; } @@ -45,6 +46,7 @@ bool SequenceReverseOp::AttachImpl(const cpp::OpDesc &opdesc, scope->FindVar(opdesc.Output("Y").front())->GetMutable(); CHECK(param_.X); CHECK(param_.Out); + return true; } diff --git a/lite/operators/topk_pooling_op.cc b/lite/operators/topk_pooling_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..76634d216a8a120f4e83dfe511089c6deb750cba --- /dev/null +++ b/lite/operators/topk_pooling_op.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/topk_pooling_op.h" +#include "lite/core/op_registry.h" +namespace paddle { +namespace lite { +namespace operators { + +bool TopkPoolingOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + CHECK_OR_FALSE(param_.Out); + return true; +} + +bool TopkPoolingOp::InferShapeImpl() const { + auto out_dims = param_.X->dims(); + out_dims[1] *= param_.top_k; + auto out = param_.Out; + out->Resize(out_dims); + out->set_lod(param_.X->lod()); + + return true; +} + +bool TopkPoolingOp::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { + auto x = op_desc.Input("X").front(); + auto y = op_desc.Input("Y").front(); + param_.X = scope->FindTensor(x); + param_.Y = scope->FindTensor(y); + auto output = op_desc.Output("Out").front(); + param_.Out = scope->FindMutableTensor(output); + param_.top_k = op_desc.GetAttr("top_k"); + param_.feat_map_num = op_desc.GetAttr("feat_map_num"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(topk_pooling, paddle::lite::operators::TopkPoolingOp); diff --git a/lite/operators/topk_pooling_op.h b/lite/operators/topk_pooling_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ec48c476ca3e6854038bed591ca59402eda93736 --- /dev/null +++ b/lite/operators/topk_pooling_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+#include <string>
+#include <vector>
+#include "lite/core/op_lite.h"
+#include "lite/core/scope.h"
+#include "lite/utils/all.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class TopkPoolingOp : public OpLite {
+ public:
+  TopkPoolingOp() {}
+  explicit TopkPoolingOp(const std::string &op_type) : OpLite(op_type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShapeImpl() const override;
+
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+  std::string DebugString() const override { return "topk_pooling"; }
+
+ private:
+  mutable TopkPoolingParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/operators/transpose_op.cc b/lite/operators/transpose_op.cc
index fe40bf6fa2f84ce7c999b41435aed00cd6555887..8f1372a883a1cd54ac2368f1e7f5e30a60a6b1db 100644
--- a/lite/operators/transpose_op.cc
+++ b/lite/operators/transpose_op.cc
@@ -43,24 +43,9 @@ bool TransposeOp::CheckShape() const {
 }
 
 bool TransposeOp::InferShapeImpl() const {
-  CHECK_OR_FALSE(param_.x);
-  CHECK_OR_FALSE(param_.output);
   auto x_dims = param_.x->dims();
-  auto x_rank = x_dims.size();
   std::vector<int> axis = param_.axis;
   size_t axis_size = axis.size();
-  // "The input tensor's rank(%d) should be equal to the axis's size(%d)",
-  // x_rank, axis_size
-  CHECK_OR_FALSE(x_rank == axis_size);
-
-  std::vector<int> count(axis_size, 0);
-  for (size_t i = 0; i < axis_size; i++) {
-    // Each element of Attribute axis should be a unique value
-    // range from 0 to (dims - 1),
-    // where the dims is the axis's size
-    CHECK_OR_FALSE(axis[i] < static_cast<int>(axis_size) &&
-                   ++count[axis[i]] == 1);
-  }
   lite::DDim out_dims(x_dims);
   for (size_t i = 0; i < axis_size; i++) {
     out_dims[i] = x_dims[axis[i]];
@@ -113,24 +98,9 @@ bool Transpose2Op::CheckShape() const {
 }
 
 bool Transpose2Op::InferShapeImpl() const {
-  CHECK_OR_FALSE(param_.x);
-  CHECK_OR_FALSE(param_.output);
   auto x_dims = param_.x->dims();
-  auto x_rank = x_dims.size();
   std::vector<int> axis = param_.axis;
   size_t axis_size = axis.size();
-  // "The input tensor's rank(%d) should be equal to the axis's size(%d)",
-  // x_rank, axis_size
-  CHECK_OR_FALSE(x_rank == axis_size);
-
-  std::vector<int> count(axis_size, 0);
-  for (size_t i = 0; i < axis_size; i++) {
-    // Each element of Attribute axis should be a unique value
-    // range from 0 to (dims - 1),
-    // where the dims is the axis's size
-    CHECK_OR_FALSE(axis[i] < static_cast<int>(axis_size) &&
-                   ++count[axis[i]] == 1);
-  }
   lite::DDim out_dims(x_dims);
   for (size_t i = 0; i < axis_size; i++) {
     out_dims[i] = x_dims[axis[i]];
diff --git a/lite/operators/var_conv_2d_op.cc b/lite/operators/var_conv_2d_op.cc
index 8cf11f6465d73646ec9bf846cbe6347bdc4b9f5b..83b6cc6a24ed1537adec8fd7d54a477edf91f873 100644
--- a/lite/operators/var_conv_2d_op.cc
+++ b/lite/operators/var_conv_2d_op.cc
@@ -52,6 +52,15 @@ bool VarConv2dOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
   if (opdesc.HasAttr("fuse_relu")) {
     param_.fuse_relu = opdesc.GetAttr<bool>("fuse_relu");
   }
+#ifdef LITE_WITH_XPU
+  if (opdesc.HasAttr("__xpu__float_to_fix")) {
+    param_.__xpu__float_to_fix = opdesc.GetAttr<bool>("__xpu__float_to_fix");
+  }
+  if (opdesc.HasAttr("__xpu__w_max")) {
+    param_.__xpu__w_max = opdesc.GetAttr<float>("__xpu__w_max");
+  }
+#endif
+
   return true;
 }
diff --git a/lite/operators/where_index_op.cc b/lite/operators/where_index_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..81443b7058e0c7d68008cbe98040b3f50eac852f
--- /dev/null
+++ b/lite/operators/where_index_op.cc
@@ -0,0 +1,51 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/where_index_op.h"
+#include "lite/core/op_registry.h"
+namespace paddle {
+namespace lite {
+namespace operators {
+
+bool WhereIndexdOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.input);
+  CHECK_OR_FALSE(param_.output);
+  CHECK_GE(param_.input->dims().size(), 1);
+  return true;
+}
+
+bool WhereIndexdOp::InferShapeImpl() const {
+  int64_t rank = static_cast<int64_t>(param_.input->dims().size());
+  int64_t numel = static_cast<int64_t>(param_.input->dims().production());
+  param_.output->Resize({numel, rank});
+  return true;
+}
+
+bool WhereIndexdOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
+  AttachParam(&param_);
+  auto input = opdesc.Input("Condition").front();
+  auto output = opdesc.Output("Out").front();
+  CHECK(scope->FindVar(input));
+  CHECK(scope->FindVar(output));
+  param_.input = GetVar<lite::Tensor>(scope, input);
+  param_.output = GetMutableVar<lite::Tensor>(scope, output);
+
+  return true;
+}
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_OP(where_index, paddle::lite::operators::WhereIndexdOp);
diff --git a/lite/operators/where_index_op.h b/lite/operators/where_index_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..157a3cb0be33ffad275ae55a0999095357a09948
--- /dev/null
+++ b/lite/operators/where_index_op.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+#include "lite/core/op_lite.h"
+
+namespace paddle {
+namespace lite {
+namespace operators {
+
+class WhereIndexdOp : public OpLite {
+ public:
+  WhereIndexdOp() {}
+  explicit WhereIndexdOp(const std::string &op_type) : OpLite(op_type) {}
+  bool CheckShape() const override;
+  bool InferShapeImpl() const override;
+  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
+  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
+  std::string DebugString() const override { return "where_index_op"; }
+
+ private:
+  mutable WhereIndexParam param_;
+};
+
+}  // namespace operators
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/tests/api/CMakeLists.txt b/lite/tests/api/CMakeLists.txt
index 810a20abbc0d13897822cef2c99e5942e352a19f..844c3f2ac7146e05b2d93eac76279df022e06652 100644
--- a/lite/tests/api/CMakeLists.txt
+++ b/lite/tests/api/CMakeLists.txt
@@ -6,11 +6,25 @@ if(LITE_WITH_XPU)
   lite_cc_test(test_ernie_lite_xpu SRCS test_ernie_lite_xpu.cc
     DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
     ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
-    ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
+    ARGS --model_dir=${LITE_MODEL_DIR}/ernie)
   lite_cc_test(test_bert_lite_xpu SRCS test_bert_lite_xpu.cc
     DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
     ${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
-    ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
+    ARGS --model_dir=${LITE_MODEL_DIR}/bert)
+  if(WITH_TESTING)
+    add_dependencies(test_resnet50_lite_xpu extern_lite_download_resnet50_tar_gz)
+    add_dependencies(test_ernie_lite_xpu extern_lite_download_ernie_tar_gz)
+    add_dependencies(test_bert_lite_xpu extern_lite_download_bert_tar_gz)
+  endif()
+  # TODO(miaotianxiang): enable later
+  #lite_cc_test(test_fpr_lite_xpu SRCS test_fpr_lite_xpu.cc
+    #DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
+    #${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
+    #ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
+  #lite_cc_test(test_mmdnn_lite_xpu SRCS test_mmdnn_lite_xpu.cc
+    #DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
+    #${ops} ${host_kernels} ${x86_kernels} ${xpu_kernels}
+    #ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
 endif()
 
 if(LITE_WITH_RKNPU)
diff --git a/lite/tests/api/test_bert_lite_xpu.cc b/lite/tests/api/test_bert_lite_xpu.cc
index b3ee9febb3f0eabd36118680beca66ace9470de4..5d66fd0d5496e105ba97bea6c5e5387d96c9e01b 100644
--- a/lite/tests/api/test_bert_lite_xpu.cc
+++ b/lite/tests/api/test_bert_lite_xpu.cc
@@ -93,7 +93,7 @@ TEST(Ernie, test_ernie_lite_xpu) {
   for (size_t i = 0; i < results.size(); ++i) {
     for (size_t j = 0; j < results[i].size(); ++j) {
       EXPECT_NEAR(
-          out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 1e-5);
+          out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 3e-5);
     }
   }
 }
diff --git a/lite/tests/api/test_ernie_lite_xpu.cc b/lite/tests/api/test_ernie_lite_xpu.cc
index 0b614fec96cbcc5d9c96653681d0e8794cf4ab8f..b1db9f353657f3f09bcad25db4e777b05f15e0f7 100644
--- a/lite/tests/api/test_ernie_lite_xpu.cc
+++ b/lite/tests/api/test_ernie_lite_xpu.cc
@@ -93,7 +93,7 @@ TEST(Ernie, test_ernie_lite_xpu) {
   for (size_t i = 0; i < results.size(); ++i) {
     for (size_t j = 0; j < results[i].size(); ++j) {
       EXPECT_NEAR(
-          out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 1e-5);
+          out->data<float>()[j + (out->shape()[1] * i)], results[i][j], 2e-5);
     }
   }
 }
diff --git a/lite/tests/api/test_fpr_lite_xpu.cc b/lite/tests/api/test_fpr_lite_xpu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..026c25690fe2a673be0a5a97b163d7bbe5fdb4f6
--- /dev/null
+++ b/lite/tests/api/test_fpr_lite_xpu.cc
@@ -0,0 +1,69 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gflags/gflags.h>
+#include <gtest/gtest.h>
+#include <vector>
+#include "lite/api/lite_api_test_helper.h"
+#include "lite/api/paddle_api.h"
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/api/paddle_use_passes.h"
+#include "lite/api/test_helper.h"
+#include "lite/utils/cp_logging.h"
+
+namespace paddle {
+namespace lite {
+
+TEST(ResnetCbam, test_resnet_cbam_lite_xpu) {
+  lite_api::CxxConfig config;
+  // config.set_model_dir(FLAGS_model_dir);
+  config.set_model_file(FLAGS_model_dir + "/__model__");
+  config.set_param_file(FLAGS_model_dir + "/__params__");
+  config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)},
+                           lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
+                           lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
+  config.set_xpu_workspace_l3_size_per_thread();
+  auto predictor = lite_api::CreatePaddlePredictor(config);
+
+  auto input_tensor = predictor->GetInput(0);
+  std::vector<int64_t> input_shape{1, 3, 224, 224};
+  input_tensor->Resize(input_shape);
+  auto* data = input_tensor->mutable_data<float>();
+  int input_num = 1;
+  for (size_t i = 0; i < input_shape.size(); ++i) {
+    input_num *= input_shape[i];
+  }
+  for (int i = 0; i < input_num; i++) {
+    data[i] = 1;
+  }
+
+  for (int i = 0; i < FLAGS_warmup; ++i) {
+    predictor->Run();
+  }
+
+  auto start = GetCurrentUS();
+  for (int i = 0; i < FLAGS_repeats; ++i) {
+    predictor->Run();
+  }
+
+  LOG(INFO) << "================== Speed Report ===================";
+  LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
+            << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
+            << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
+            << " ms in average.";
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/tests/api/test_mmdnn_lite_xpu.cc b/lite/tests/api/test_mmdnn_lite_xpu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a2a98821e70cb462b23887f851cfc4bce6b463ca
--- /dev/null
+++ b/lite/tests/api/test_mmdnn_lite_xpu.cc
@@ -0,0 +1,311 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include +#include +#include +#include "lite/api/lite_api_test_helper.h" +#include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/utils/cp_logging.h" +#include "lite/utils/string.h" + +DEFINE_bool(perf, false, "perf?"); +DEFINE_string(perf_input, "perf_input", "perf_input"); + +namespace paddle { +namespace lite { + +std::vector input0; +std::vector input0_lod = {0}; +std::vector input1; +std::vector input1_lod = {0}; +std::vector input2; +std::vector input2_lod = {0}; +std::vector input3; +std::vector input3_lod = {0}; +std::vector input4; +std::vector input4_lod = {0}; +std::vector input5; +std::vector input5_lod = {0}; + +void ParseInput() { + std::string raw_input = + "0 1;145 10251 839 3719 428 52;1050488 1050488 911898 3719 760166 " + "760166;3719 428 52 18 1102 10327 252 20 153 2897 1146 70 156 6 145 " + "10251 839 5 1779 1729 1779 1729 18 2707 6 2707 20 4742 4937 432 6 " + "3869;3719 760166 760166 18 1035176 1035176 764393 764393 1259006 767614 " + "767614 1020808 769579 793958 793958 1050488 911898 751332 751332 750336 " + "750799 750336 751575 751575 751544 751735 751397 751365 751512 751512 " + "753011 751562;3719 428 52 18 1102 10327 252 20 153 2897 1146 70 156 6 " + "145 10251 839 2 1211 3 3719 720 1540 145 10251 839 9405 4315 5998 4 2 " + "600 373 41 3719 428 52 44 10251 4302 1319 7 12 2 768 6 918 6 841 870 8 " + "843 8 271;3719 760166 760166 18 1035176 1035176 764393 764393 1259006 " + "767614 767614 1020808 769579 793958 793958 1050488 911898 2 773899 " + "773899 3719 1118420 1118420 1050488 1050488 911898 9405 4315 5998 4 2 " + "785435 785435 41 3719 760166 760166 44 10251 4302 1319 750118 750118 2 " + "750465 750465 750274 750398 750233 751252 751252 753447 752830 753112;\n" + "0 0;145 10251 839 3719 428 52;1050488 1050488 911898 3719 760166 " + "760166;2109 2467 1805 227 3719 428 52 18 1102 10327 252 20 6 242 78 6 " + "532 78;2109 2467 1805 1245431 1245431 760166 760166 18 1035176 1035176 " + "764393 764393 752116 242 750370 750370 752081 751247;2109 2467 1805 227 " + "3719 428 52 18 1102 10327 252 20 2 145 242 1050 252 3582 2212;2109 2467 " + "1805 1245431 1245431 760166 760166 18 1035176 1035176 764393 764393 2 " + "871717 871717 757921 757921 3582 2212;\n" + "0 0;145 10251 839 3719 428 52;1050488 1050488 911898 3719 760166 " + "760166;145 10251 839 76 31 1337 823 7506 567 65 170 8 21293 3719 5 43 " + "394 743 42;1050488 1050488 911898 750016 750016 1337 823 7506 762617 " + "762617 866652 8 21293 3719 5 43 914758 914758 757202;145 10251 839 76 " + "31 1337 823 7506 567 65 170 8 21293 3719 2 17580 30 523324 3 10251 4104 " + "281 3 8511 3719 2217 3 13 226 3083 4 11251 1606 357 9 2 145 10251 839 " + "76 31 1337 823 7506 567 65 170 2 7506 2445 8 145 10251 839 528 839 " + "19670 6538;1050488 1050488 911898 750016 750016 1337 823 7506 762617 " + "762617 866652 8 21293 3719 2 816626 816626 523324 3 1181698 1181698 " + "751656 780821 1063148 3719 2217 3 752498 752498 831323 753602 11251 " + "1606 357 9 2 1050488 1050488 911898 750016 750016 1337 823 7506 762617 " + "762617 866652 2 7506 753045 753045 756756 1050488 911898 528 839 19670 " + "6538;\n" + "0 0;145 10251 839 3719 428 52;1050488 1050488 911898 3719 760166 " + "760166;145 10251 839 99 4 1102 10327 2196 41 3719 428 52 44 99 4 2899 " + "229 10 10 10;1050488 1050488 911898 807966 750273 1035176 1035176 " + "1237875 41 3719 760166 760166 753645 753645 750273 2899 229 
750001 " + "750001 750001;145 10251 839 99 4 1102 10327 2196 41 3719 428 52 44 99 4 " + "2899 229 10 10 10 2 1177 8 145 10251 839 99 4 1102 10327 2196 41 3719 " + "428 52 44 99 4 2 101 8 1922 17 2184 2 1154 1922 72 1198 1266 " + "4516;1050488 1050488 911898 807966 750273 1035176 1035176 1237875 41 " + "3719 760166 760166 753645 753645 750273 2899 229 750001 750001 750001 2 " + "750257 750257 756756 1050488 911898 807966 750273 1035176 1035176 " + "1237875 41 3719 760166 760166 753645 753645 750273 2 764513 764513 " + "851213 851213 854628 2 753018 753018 754317 753328 754085 754070;\n" + "0 0;145 10251 839 3719 428 52;1050488 1050488 911898 3719 760166 " + "760166;73 5347 112 8 145 10251 839 262 169 22729 3719 6 743 6 339 1156 " + "78 136 399 693 128 571;776150 776150 112 756756 756756 1050488 911898 " + "791355 791355 22729 3719 6 758277 758277 750137 750234 750241 750178 " + "750055 750216 750212 750049;73 5347 112 8 145 10251 839 262 169 22729 " + "3719 2 588 415 549 415 115 23;776150 776150 112 756756 756756 1050488 " + "911898 791355 791355 22729 3719 2 750221 750221 750262 750277 750277 " + "750261;"; + auto raw_lines = Split(raw_input, "\n"); + for (auto& raw_line : raw_lines) { + auto inputx = Split(raw_line, ";"); + for (size_t i = 1; i < inputx.size(); ++i) { + auto tokens = Split(inputx[i], " "); + static std::vector* const input_array[] = { + &input0, &input0, &input1, &input2, &input3, &input4, &input5}; + static std::vector* const lod_array[] = {&input0_lod, + &input0_lod, + &input1_lod, + &input2_lod, + &input3_lod, + &input4_lod, + &input5_lod}; + for (auto token : tokens) { + input_array[i]->push_back((int64_t)atoi(token.c_str())); + } + lod_array[i]->push_back((uint64_t)tokens.size() + + (*lod_array[i])[lod_array[i]->size() - 1]); + } + } + return; +} + +class MmdnnReader { + std::ifstream ifs; + std::vector StringSplit(const std::string& in, + const std::string& delim) { + std::vector ret; + if (in == "") { + return ret; + } + auto begpos = in.find_first_not_of(delim); + while (begpos != std::string::npos) { + auto endpos = in.find_first_of(delim, begpos); + if (endpos == std::string::npos) { + endpos = in.size(); + } + std::string ssubstr = in.substr(begpos, endpos - begpos); + ret.push_back(ssubstr); + begpos = endpos + 1; + if (endpos >= (in.size() - 1)) { + break; + } + } + return ret; + } + + public: + std::vector data[6]; + std::vector lod[6]; + + void Init(std::string file_name) { ifs.open(file_name); } + + int Read(int maxline) { + for (int i = 0; i < 6; i++) { + data[i].clear(); + } + for (int i = 0; i < 6; i++) { + lod[i].clear(); + lod[i].push_back(0); + } + std::string line; + int cnt = 0; + while (cnt < maxline && getline(ifs, line)) { + std::vector split1 = StringSplit(line, ";"); + for (int i = 1; i < 7; i++) { + std::vector split2 = StringSplit(split1[i], " "); + if (split2.size() == 0) { + split2.push_back("1280000"); + } + for (size_t j = 0; j < split2.size(); j++) { + data[i - 1].push_back(std::stoi(split2[j].c_str(), nullptr, 0)); + } + // if (i % 2 == 1) { + // lod[i / 2].push_back(lod[i / 2].back() + split2.size()); + //} + lod[i - 1].push_back(lod[i - 1].back() + split2.size()); + } + cnt++; + } + return cnt; + } +}; + +TEST(MMDNN, test_mmdnn_lite_xpu) { + lite_api::CxxConfig config; + config.set_model_dir(FLAGS_model_dir); + config.set_valid_places({lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}, + lite_api::Place{TARGET(kXPU), PRECISION(kInt64)}, + lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, + lite_api::Place{TARGET(kX86), 
PRECISION(kInt64)}, + lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); + config.set_xpu_workspace_l3_size_per_thread(); + auto predictor = lite_api::CreatePaddlePredictor(config); + + if (FLAGS_perf) { + MmdnnReader reader; + reader.Init(FLAGS_perf_input); + int UB_batch = 40; // upper bound of batch + int iter = 0; + double tsc_sum = 0; + + while (true) { + int batch = reader.Read(UB_batch); + if (batch <= 0) { + break; + } + ++iter; + for (int i = 0; i < 6; ++i) { + auto input_x = predictor->GetInput(i); + input_x->Resize({(int64_t)reader.data[i].size(), 1}); + input_x->SetLoD({reader.lod[i]}); + auto* data_x = input_x->mutable_data(); + memcpy(data_x, + reader.data[i].data(), + reader.data[i].size() * sizeof(int64_t)); + } + + auto start = GetCurrentUS(); + predictor->Run(); + auto end = GetCurrentUS(); + tsc_sum += end - start; + } + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " + << FLAGS_threads << ", warmup: " << FLAGS_warmup + << ", repeats: " << iter << ", spend " << tsc_sum / iter / 1000.0 + << " ms in average."; + + return; + } + + ParseInput(); + + { + std::vector input0_shape{(int64_t)input0.size(), 1}; + auto input_tensor0 = predictor->GetInput(0); + input_tensor0->Resize(input0_shape); + input_tensor0->SetLoD({input0_lod}); + auto* data0 = input_tensor0->mutable_data(); + memcpy(data0, input0.data(), sizeof(int64_t) * input0.size()); + } + { + std::vector input1_shape{(int64_t)input1.size(), 1}; + auto input_tensor1 = predictor->GetInput(1); + input_tensor1->Resize(input1_shape); + input_tensor1->SetLoD({input1_lod}); + auto* data1 = input_tensor1->mutable_data(); + memcpy(data1, input1.data(), sizeof(int64_t) * input1.size()); + } + { + std::vector input2_shape{(int64_t)input2.size(), 1}; + auto input_tensor2 = predictor->GetInput(2); + input_tensor2->Resize(input2_shape); + input_tensor2->SetLoD({input2_lod}); + auto* data2 = input_tensor2->mutable_data(); + memcpy(data2, input2.data(), sizeof(int64_t) * input2.size()); + } + { + std::vector input3_shape{(int64_t)input3.size(), 1}; + auto input_tensor3 = predictor->GetInput(3); + input_tensor3->Resize(input3_shape); + input_tensor3->SetLoD({input3_lod}); + auto* data3 = input_tensor3->mutable_data(); + memcpy(data3, input3.data(), sizeof(int64_t) * input3.size()); + } + { + std::vector input4_shape{(int64_t)input4.size(), 1}; + auto input_tensor4 = predictor->GetInput(4); + input_tensor4->Resize(input4_shape); + input_tensor4->SetLoD({input4_lod}); + auto* data4 = input_tensor4->mutable_data(); + memcpy(data4, input4.data(), sizeof(int64_t) * input4.size()); + } + { + std::vector input5_shape{(int64_t)input5.size(), 1}; + auto input_tensor5 = predictor->GetInput(5); + input_tensor5->Resize(input5_shape); + input_tensor5->SetLoD({input5_lod}); + auto* data5 = input_tensor5->mutable_data(); + memcpy(data5, input5.data(), sizeof(int64_t) * input5.size()); + } + + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor->Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeats; ++i) { + predictor->Run(); + } + + auto out = predictor->GetOutput(0); + auto out_shape = out->shape(); + auto out_size = std::accumulate( + out_shape.begin(), out_shape.end(), 1, std::multiplies()); + for (int i = 0; i < out_size; ++i) { + LOG(INFO) << "out[" << i << "] = " << out->data()[i]; + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << 
FLAGS_threads + << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats + << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 + << " ms in average."; +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index d29f88f334754720b4681042ac5693723e028ba1..9fa795ad89981c52d00772dcd86d952430782adb 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -63,6 +63,7 @@ if(LITE_BUILD_EXTRA) lite_cc_test(test_kernel_lookup_table_dequant_compute SRCS lookup_table_dequant_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_ctc_align_compute SRCS ctc_align_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_clip_compute SRCS clip_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) # for training kernel if (LITE_WITH_TRAIN) diff --git a/lite/tests/kernels/activation_compute_test.cc b/lite/tests/kernels/activation_compute_test.cc index 5824ba91c2f824dd351f8977aa497b9ce2238ec6..a62c698f83fe10409af0bba8774135d3409358ea 100644 --- a/lite/tests/kernels/activation_compute_test.cc +++ b/lite/tests/kernels/activation_compute_test.cc @@ -300,7 +300,7 @@ TEST(Activation_relu, precision) { abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; @@ -426,7 +426,7 @@ TEST(Activation_tanh, precision) { abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; @@ -572,7 +572,7 @@ TEST(Activation_gelu, precision) { LOG(INFO) << "test gelu op"; Place place; float abs_error = 2e-5; -#if defined(LITE_WITH_XPU) +#if defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/activation_grad_compute_test.cc b/lite/tests/kernels/activation_grad_compute_test.cc index 5d5046b01dee6c84f341159b68300197c20695e6..2ad5b80a910f323b34b039eabda0ceb4b49784c5 100644 --- a/lite/tests/kernels/activation_grad_compute_test.cc +++ b/lite/tests/kernels/activation_grad_compute_test.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "lite/kernels/arm/activation_grad_compute.h" +#include "lite/kernels/host/activation_grad_compute.h" #include #include "lite/core/op_registry.h" #include "lite/kernels/arm/activation_compute.h" @@ -20,13 +20,11 @@ namespace paddle { namespace lite { namespace kernels { -namespace arm { using param_t = operators::ActivationParam; using grad_param_t = operators::ActivationGradParam; -using kernel_t = SquareCompute; -using grad_kernel_t = SquareGradCompute; +template class ActivationGradTester { public: explicit ActivationGradTester(DDim dims) : dims_(dims) {} @@ -71,22 +69,28 @@ class ActivationGradTester { void run_backward(grad_param_t* param, grad_kernel_t* kernel, const std::vector& in_vec, + const std::vector& out_vec, const std::vector& out_grad_vec, float* in_grad_vec) { Tensor x; + Tensor out; Tensor x_grad; Tensor out_grad; x.Resize(dims_); + out.Resize(dims_); x_grad.Resize(dims_); out_grad.Resize(dims_); auto* x_data = x.mutable_data(); + auto* out_data = out.mutable_data(); auto* out_grad_data = out_grad.mutable_data(); for (int i = 0; i < dims_.production(); i++) { x_data[i] = in_vec[i]; + out_data[i] = out_vec[i]; out_grad_data[i] = out_grad_vec[i]; } param->X = &x; + param->Out = &out; param->X_grad = &x_grad; param->Out_grad = &out_grad; kernel->SetParam(*param); @@ -102,7 +106,9 @@ class ActivationGradTester { std::vector x(dims_.production()); std::vector out(dims_.production()); for (int i = 0; i < dims_.production(); i++) { - x[i] = 1.0 * static_cast(i % 128) * 0.3f - 1.1; + x[i] = static_cast(i % 3 - 2.0) / 2.0 * 0.333 + + static_cast(i % 19 - 10.0) / 10.0 * 0.333 + + static_cast(i % 39 - 20.0) / 20.0 * 0.333 + 0.001213; } this->run_forward(¶m_, &kernel_, x, out.data()); @@ -120,7 +126,8 @@ class ActivationGradTester { for (int i = 0; i < dims_.production(); i++) { out_grad[i] = 1.0; } - this->run_backward(&grad_param_, &grad_kernel_, x, out_grad, x_grad.data()); + this->run_backward( + &grad_param_, &grad_kernel_, x, out, out_grad, x_grad.data()); for (int i = 0; i < dims_.production(); i++) { EXPECT_NEAR(x_grad[i], (out_delta[i] - out[i]) / delta, max_grad_delta); @@ -137,31 +144,58 @@ class ActivationGradTester { grad_param_t grad_param_; }; -void TestNormalCase(DDim dims) { - std::unique_ptr tester(new ActivationGradTester(dims)); +void TestSquareGrad(DDim dims) { + LOG(INFO) << "Test Square grad"; + std::unique_ptr< + ActivationGradTester> + tester( + new ActivationGradTester( + dims)); tester->prepare_kernel(); float delta = 0.001; float max_grad_delta = 0.005; tester->check_grad(delta, max_grad_delta); } -TEST(activation_grad_arm, compute) { - LOG(INFO) << "Test Square grad"; +void TestReluGrad(DDim dims) { + LOG(INFO) << "Test Relu grad"; + std::unique_ptr> + tester(new ActivationGradTester( + dims)); + tester->prepare_kernel(); + float delta = 0.001; + float max_grad_delta = 0.005; + tester->check_grad(delta, max_grad_delta); +} + +void TestTanhGrad(DDim dims) { + LOG(INFO) << "Test Tanh grad"; + std::unique_ptr> + tester(new ActivationGradTester( + dims)); + tester->prepare_kernel(); + float delta = 0.001; + float max_grad_delta = 0.005; + tester->check_grad(delta, max_grad_delta); +} + +TEST(activation_grad_host, compute) { DeviceInfo::Init(); - for (auto n : {2}) { - for (auto c : {2}) { - for (auto h : {2}) { - for (auto w : {2}) { - TestNormalCase(DDim(std::vector({n, c, h, w}))); + for (auto n : {2, 1}) { + for (auto c : {2, 9}) { + for (auto h : {2, 1}) { + for (auto w : {2, 10}) { + TestSquareGrad(DDim(std::vector({n, c, h, w}))); + 
TestReluGrad(DDim(std::vector({n, c, h, w}))); + TestTanhGrad(DDim(std::vector({n, c, h, w}))); } } } } } -} // namespace arm } // namespace kernels } // namespace lite } // namespace paddle USE_LITE_KERNEL(square, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(square_grad, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(square_grad, kHost, kFloat, kNCHW, def); diff --git a/lite/tests/kernels/batch_norm_compute_test.cc b/lite/tests/kernels/batch_norm_compute_test.cc index ae65e0e3c320ff153a99d2a1656227bad34428d4..9674f95d0b52dbc264ef78748d0c0fba1e4ebc37 100644 --- a/lite/tests/kernels/batch_norm_compute_test.cc +++ b/lite/tests/kernels/batch_norm_compute_test.cc @@ -157,7 +157,7 @@ TEST(BatchNorm, precision) { LOG(INFO) << "test BatchNorm op"; float abs_error = 2e-5; Place place; -#if defined(LITE_WITH_XPU) +#if defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #elif defined(LITE_WITH_NPU) place = TARGET(kNPU); diff --git a/lite/tests/kernels/box_clip_compute_test.cc b/lite/tests/kernels/box_clip_compute_test.cc index 72947fa4b258a894e5a73c5e8fe8cce12ef9a02c..c599e64214d3fb15a52cb14fe48de7a7d75b2868 100644 --- a/lite/tests/kernels/box_clip_compute_test.cc +++ b/lite/tests/kernels/box_clip_compute_test.cc @@ -70,9 +70,7 @@ class BoxClipComputeTester : public arena::TestCase { float sign = i % 3 == 0 ? -1.0f : 1.0f; input_data[i] = sign * static_cast((i * 7) % 20); } - SetCommonTensor(input_, input_dims_, input_data.data()); - auto input_tensor = baseline_scope()->FindMutableTensor(input_); - input_tensor->set_lod(input_lod_); + SetCommonTensor(input_, input_dims_, input_data.data(), input_lod_); std::vector im_info_data{10, 10, 1, 15, 15, 1}; SetCommonTensor(im_info_, im_info_dim_, im_info_data.data()); diff --git a/lite/tests/kernels/cast_compute_test.cc b/lite/tests/kernels/cast_compute_test.cc index 86331bb8a1cce89da76d2ebb87a9d091e34f68c5..34038dfdc797d0e5ee618b575ad532fd64809276 100644 --- a/lite/tests/kernels/cast_compute_test.cc +++ b/lite/tests/kernels/cast_compute_test.cc @@ -135,7 +135,7 @@ TEST(Cast, precision) { float abs_error = 2e-5; #if defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/clip_compute_test.cc b/lite/tests/kernels/clip_compute_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c6149bb753b2a83813d0a129d61d7444456c399 --- /dev/null +++ b/lite/tests/kernels/clip_compute_test.cc @@ -0,0 +1,130 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" + +namespace paddle { +namespace lite { + +class ClipComputeTester : public arena::TestCase { + protected: + // common attributes for this op. 
+ std::string x_ = "x"; + std::string out_ = "out"; + std::string min_tensor_ = "min_tensor"; + std::string max_tensor_ = "max_tensor"; + float min_{}; + float max_{}; + bool use_minmax_tensor_{}; + DDim x_dims_; + + public: + ClipComputeTester(const Place& place, + const std::string& alias, + int n, + int c, + int h, + int w, + float min, + float max, + bool use_minmax_tensor) + : TestCase(place, alias) { + x_dims_ = DDim(std::vector({n, c, h, w})); + min_ = min; + max_ = max; + use_minmax_tensor_ = use_minmax_tensor; + } + + void RunBaseline(Scope* scope) override { + auto* x = scope->FindTensor(x_); + auto* out = scope->NewTensor(out_); + CHECK(out); + out->Resize(x->dims()); + const auto* x_data = x->data(); + auto* out_data = out->mutable_data(); + + for (int i = 0; i < x->numel(); i++) { + if (x_data[i] < min_) + out_data[i] = min_; + else if (x_data[i] > max_) + out_data[i] = max_; + else + out_data[i] = x_data[i]; + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType("clip"); + op_desc->SetInput("X", {x_}); + op_desc->SetOutput("Out", {out_}); + if (use_minmax_tensor_) { + op_desc->SetInput("Min", {min_tensor_}); + op_desc->SetInput("Max", {max_tensor_}); + op_desc->SetAttr("min", 0.f); + op_desc->SetAttr("max", 0.f); + } else { + op_desc->SetAttr("min", min_); + op_desc->SetAttr("max", max_); + } + } + + void PrepareData() override { + std::vector x_data(x_dims_.production()); + for (int i = 0; i < x_dims_.production(); i++) { + float sign = i % 3 == 0 ? -1.0f : 1.0f; + x_data[i] = sign * static_cast(i % 128) * 0.013f + 0.001; + } + SetCommonTensor(x_, x_dims_, x_data.data()); + + if (use_minmax_tensor_) { + std::vector min_data = {min_}; + SetCommonTensor( + min_tensor_, DDim(std::vector({1})), min_data.data()); + + std::vector max_data = {max_}; + SetCommonTensor( + max_tensor_, DDim(std::vector({1})), max_data.data()); + } + } +}; + +TEST(Clip, precision) { + LOG(INFO) << "test clip op"; +#ifdef LITE_WITH_ARM + Place place(TARGET(kARM)); + + float min = -1; + float max = 1; + for (int n : {1, 3}) { + for (int c : {3, 5}) { + for (int h : {5, 6}) { + for (int w : {6, 7}) { + for (bool use_minmax_tensor : {true, false}) { + std::unique_ptr tester(new ClipComputeTester( + place, "def", n, c, h, w, min, max, use_minmax_tensor)); + arena::Arena arena(std::move(tester), place, 2e-5); + arena.TestPrecision(); + } + } + } + } + } +#endif +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/dropout_compute_test.cc b/lite/tests/kernels/dropout_compute_test.cc index 025f02ce31505cee684fb9a21c7b26d96e1c3026..c4ecc0cf01e3da7c43294ba1249b5b4f106caa95 100644 --- a/lite/tests/kernels/dropout_compute_test.cc +++ b/lite/tests/kernels/dropout_compute_test.cc @@ -94,7 +94,7 @@ TEST(Dropout, precision) { #if defined(LITE_WITH_NPU) place = TARGET(kNPU); abs_error = 1e-2; // Using fp16 in NPU -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/elementwise_compute_test.cc b/lite/tests/kernels/elementwise_compute_test.cc index 505ab72dc125d5b527845f4695a444c215422f8b..d91c304ef7e76b9ff623ebfe1bb9ad5bb4ace2c9 100644 --- a/lite/tests/kernels/elementwise_compute_test.cc +++ b/lite/tests/kernels/elementwise_compute_test.cc @@ -228,7 +228,7 @@ TEST(Elementwise, precision) { abs_error = 1e-2; // use fp16 in npu #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = 
TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/elementwise_grad_compute_test.cc b/lite/tests/kernels/elementwise_grad_compute_test.cc index 2b5fbbb65d3d7e17bf90afb71f5c8154f0d88488..04e74e49099f13a7e5920b306f8d2e26650a2574 100644 --- a/lite/tests/kernels/elementwise_grad_compute_test.cc +++ b/lite/tests/kernels/elementwise_grad_compute_test.cc @@ -215,18 +215,6 @@ class ElementwiseAddGradTester { fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production()); this->run_forward(¶m_, &kernel_, x, y, out.data()); - for (int i = 0; i < x_dims_.production(); i++) { - LOG(INFO) << "x_" << i << ": " << x[i]; - } - - for (int i = 0; i < y_dims_.production(); i++) { - LOG(INFO) << "y_" << i << ": " << y[i]; - } - - for (int i = 0; i < out_dims_.production(); i++) { - LOG(INFO) << "out_" << i << ": " << out[i]; - } - // backward std::vector out_grad(out_dims_.production()); std::vector x_grad(x_dims_.production()); @@ -242,14 +230,6 @@ class ElementwiseAddGradTester { x_grad.data(), y_grad.data()); - for (int i = 0; i < x_grad.size(); i++) { - LOG(INFO) << "x_grad_" << i << ": " << x_grad[i]; - } - - for (int i = 0; i < y_grad.size(); i++) { - LOG(INFO) << "y_grad_" << i << ": " << y_grad[i]; - } - // get numeric gradient std::vector x_delta(x_dims_.production()); std::vector y_delta(y_dims_.production()); @@ -443,18 +423,6 @@ class ElementwiseSubGradTester { fill_data_rand(y.data(), -1.f, 1.f, y_dims_.production()); this->run_forward(¶m_, &kernel_, x, y, out.data()); - for (int i = 0; i < x_dims_.production(); i++) { - LOG(INFO) << "x_" << i << ": " << x[i]; - } - - for (int i = 0; i < y_dims_.production(); i++) { - LOG(INFO) << "y_" << i << ": " << y[i]; - } - - for (int i = 0; i < out_dims_.production(); i++) { - LOG(INFO) << "out_" << i << ": " << out[i]; - } - // backward std::vector out_grad(out_dims_.production()); std::vector x_grad(x_dims_.production()); @@ -470,14 +438,6 @@ class ElementwiseSubGradTester { x_grad.data(), y_grad.data()); - for (int i = 0; i < x_grad.size(); i++) { - LOG(INFO) << "x_grad_" << i << ": " << x_grad[i]; - } - - for (int i = 0; i < y_grad.size(); i++) { - LOG(INFO) << "y_grad_" << i << ": " << y_grad[i]; - } - // get numeric gradient std::vector x_delta(x_dims_.production()); std::vector y_delta(y_dims_.production()); diff --git a/lite/tests/kernels/gather_compute_test.cc b/lite/tests/kernels/gather_compute_test.cc index 4d0ad1ab47a17c3e8d227b9e0482d7cbe21ab7e2..c023a12b0fb4e3118976d854114c554ca6bf6462 100644 --- a/lite/tests/kernels/gather_compute_test.cc +++ b/lite/tests/kernels/gather_compute_test.cc @@ -98,7 +98,7 @@ TEST(Gather, precision) { abs_error = 1e-2; // use fp16 in npu #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/layer_norm_compute_test.cc b/lite/tests/kernels/layer_norm_compute_test.cc index 5ea01a6cca504db230d62a63ef3a62d4f73470fa..bd4480b6127a318286b3172f53fc8a5bceb8c328 100644 --- a/lite/tests/kernels/layer_norm_compute_test.cc +++ b/lite/tests/kernels/layer_norm_compute_test.cc @@ -147,7 +147,7 @@ TEST(LayerNorm, precision) { LOG(INFO) << "test layer_norm op"; float abs_error = 2e-5; Place place; -#if defined(LITE_WITH_XPU) +#if defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #elif defined(LITE_WITH_NPU) place = TARGET(kNPU); diff --git a/lite/tests/kernels/lookup_table_compute_test.cc b/lite/tests/kernels/lookup_table_compute_test.cc index 
988077c6c319d5bcc8e50d6c8e5544331a86fe45..ae39abf1dbaf206fe0a68dd492a48a2452c8094e 100644 --- a/lite/tests/kernels/lookup_table_compute_test.cc +++ b/lite/tests/kernels/lookup_table_compute_test.cc @@ -116,7 +116,7 @@ TEST(LookupTable, precision) { abs_error = 1e-2; #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; @@ -132,7 +132,8 @@ TEST(LookupTable, precision) { std::vector>{{5, 2, 3, 1}, {2, 3, 1}, {3, 1}}) { for (auto w_dims : std::vector>{{4, 2}, {6, 8}, {12, 15}}) { -#if defined(LITE_WITH_XPU) && defined(LITE_WITH_NPU) +#if (defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)) || \ + defined(LITE_WITH_NPU) for (auto padding_idx : std::vector{-1}) { // Only -1 is supported by XPU or NPU #else diff --git a/lite/tests/kernels/matmul_compute_test.cc b/lite/tests/kernels/matmul_compute_test.cc index 59b0fde8fd18b8a2170b6fdbd42444f09843f077..9799c15622b07a8d126654c79738d29b176c2cf4 100644 --- a/lite/tests/kernels/matmul_compute_test.cc +++ b/lite/tests/kernels/matmul_compute_test.cc @@ -457,7 +457,7 @@ TEST(Matmul2x2, precision) { abs_error = 1e-2; // use fp16 in npu #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; @@ -489,7 +489,7 @@ TEST(Matmul2x2_y_transpose, precision) { abs_error = 1e-2; // use fp16 in npu #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/mul_compute_test.cc b/lite/tests/kernels/mul_compute_test.cc index d070292332b65ed577ec6cefdb220ee691eb99e9..d89b3569358034d72ac8019f2348b49764ca6b0c 100644 --- a/lite/tests/kernels/mul_compute_test.cc +++ b/lite/tests/kernels/mul_compute_test.cc @@ -127,7 +127,7 @@ TEST(Mul, precision) { #if defined(LITE_WITH_NPU) place = TARGET(kNPU); abs_error = 1e-2; // use fp16 in npu -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/multiclass_nms_compute_test.cc b/lite/tests/kernels/multiclass_nms_compute_test.cc index a1190197bffdf505fec77c6b22b7871316a2d125..dd16730ef551ddc11825936d99733f33015fd2c0 100644 --- a/lite/tests/kernels/multiclass_nms_compute_test.cc +++ b/lite/tests/kernels/multiclass_nms_compute_test.cc @@ -478,7 +478,7 @@ TEST(multiclass_nms, precision) { Place place; #if defined(LITE_WITH_ARM) place = TARGET(kHost); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/pool_compute_test.cc b/lite/tests/kernels/pool_compute_test.cc index 04894188b0bf1557000479ae18b0369997909f89..fc4d004e552e76792470f46a54afd6aa13bbc330 100644 --- a/lite/tests/kernels/pool_compute_test.cc +++ b/lite/tests/kernels/pool_compute_test.cc @@ -381,7 +381,7 @@ TEST(Pool, precision) { #if defined(LITE_WITH_NPU) place = TARGET(kNPU); abs_error = 1e-2; // Using fp16 in NPU -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/reshape_compute_test.cc b/lite/tests/kernels/reshape_compute_test.cc index 3a866b6cf22cf67c3f5a60e5a4aa8603cee6a1a3..f3fcc0bad5418624c86897bafc52dbf3a7ec0d8e 100644 --- a/lite/tests/kernels/reshape_compute_test.cc +++ 
b/lite/tests/kernels/reshape_compute_test.cc @@ -206,7 +206,7 @@ TEST(Reshape, precision) { abs_error = 1e-2; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kHost); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/roi_align_compute_test.cc b/lite/tests/kernels/roi_align_compute_test.cc index 8eb84dd0337d0635dc360e2e04aa1ad047e912c0..2bbfdcd81da951bd769ab03094a0df48f3a6e13b 100644 --- a/lite/tests/kernels/roi_align_compute_test.cc +++ b/lite/tests/kernels/roi_align_compute_test.cc @@ -106,13 +106,11 @@ class RoiAlignComputeTester : public arena::TestCase { } LOG(INFO) << "Read rois data. " << datas[0] << " " << datas.back(); reader.close(); - SetCommonTensor(rois_, dims, datas.data()); - auto rois_tensor = baseline_scope()->FindMutableTensor(rois_); std::vector lod0({0, 152, 304}); LoD lod; lod.push_back(lod0); - rois_tensor->set_lod(lod); + SetCommonTensor(rois_, dims, datas.data(), lod); } }; diff --git a/lite/tests/kernels/scale_compute_test.cc b/lite/tests/kernels/scale_compute_test.cc index efd0497002ee402426a7198bf47ec60c7f41d2fd..9d1f4403dc1a82e58d8c764933ba01c0e0b5c082 100644 --- a/lite/tests/kernels/scale_compute_test.cc +++ b/lite/tests/kernels/scale_compute_test.cc @@ -165,7 +165,7 @@ TEST(Scale, precision) { abs_error = 4e-3; // Using fp16 in NPU #elif defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); abs_error = 3e-4; // Some operations use fp16 in XPU #elif defined(LITE_WITH_X86) diff --git a/lite/tests/kernels/sequence_conv_compute_test.cc b/lite/tests/kernels/sequence_conv_compute_test.cc index 84887b2573516d0c82cbb8c9b4cf9336f30ee41d..68afaad04f8e84995e811f81f99a2d4109c845a5 100644 --- a/lite/tests/kernels/sequence_conv_compute_test.cc +++ b/lite/tests/kernels/sequence_conv_compute_test.cc @@ -85,21 +85,31 @@ class SequenceConvComputeTester : public arena::TestCase { auto output_dims = output->dims(); auto output_data = output->mutable_data(); std::vector> res; - if (contextStart_ == -2) { + + if (contextStart_ == -2 && lod_.size() == 1 && + lod_[0] == std::vector({0, 4})) { res = {{-0.08867277f, -0.17257819f, -0.2564836f}, {0.194508f, 0.05720823f, -0.08009153f}, {0.73512584f, 0.5749428f, 0.41475973f}, {0.5635012f, 0.49485126f, 0.42620137f}}; - } else if (contextStart_ == -1) { + } else if (contextStart_ == -1 && lod_.size() == 1 && + lod_[0] == std::vector({0, 4})) { res = {{0.194508f, 0.05720823f, -0.08009153f}, {0.73512584f, 0.5749428f, 0.41475973f}, {0.5635012f, 0.49485126f, 0.42620137f}, {0.2517162f, 0.23646072f, 0.22120519f}}; - } else if (contextStart_ == 0) { + } else if (contextStart_ == 0 && lod_.size() == 1 && + lod_[0] == std::vector({0, 4})) { res = {{0.73512584f, 0.5749428f, 0.41475973f}, {0.5635012f, 0.49485126f, 0.42620137f}, {0.2517162f, 0.23646072f, 0.22120519f}, {0.02574372f, 0.03337148f, 0.04099924f}}; + } else if (contextStart_ == -1 && lod_.size() == 1 && + lod_[0] == std::vector({0, 2, 4})) { + res = {{0.194508, 0.05720823, -0.08009153}, + {0.7093821, 0.57208234, 0.43478262}, + {0.19450802, 0.17925248, 0.16399695}, + {0.2517162, 0.23646072, 0.22120519}}; } else { fprintf(stderr, "not supported contextStart_\n"); exit(-1); @@ -136,12 +146,25 @@ void TestNormalCase(Place place, float abs_error = 2e-5) { } } +void TestBatchCase(Place place, float abs_error = 2e-5) { + std::vector> lod{{0, 2, 4}}; + std::vector dims{4, 5}; + 
std::vector candidate_pad_idx{-1}; + for (int pad_idx : candidate_pad_idx) { + std::unique_ptr tester(new SequenceConvComputeTester( + place, "def", lod, DDim(dims), pad_idx, 1, 3, 3)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } +} + TEST(sequence_conv, precision) { #ifdef LITE_WITH_ARM float abs_error = 2e-5; Place place(TARGET(kARM)); TestNormalCase(place, abs_error); + TestBatchCase(place, abs_error); #endif } diff --git a/lite/tests/kernels/slice_compute_test.cc b/lite/tests/kernels/slice_compute_test.cc index fc96b39f010eab5eedd431cb81e881b7aadb11a2..b566bfa3e86cf6067f9914b5fc3932458a6ee186 100644 --- a/lite/tests/kernels/slice_compute_test.cc +++ b/lite/tests/kernels/slice_compute_test.cc @@ -202,20 +202,15 @@ class SliceComputeTester : public arena::TestCase { DDim({static_cast(ends_.size())}), ends_.data()); } else if (use_tensor_list_) { - Scope& scope_ = this->scope(); for (int i = 0; i < starts_.size(); ++i) { - auto* tensor = scope_.NewTensor("starts_tensor_list_" + - paddle::lite::to_string(i)); - tensor->Resize(DDim({1})); - auto* d = tensor->mutable_data(); - d[0] = starts_[i]; + SetCommonTensor("starts_tensor_list_" + paddle::lite::to_string(i), + DDim({1}), + &starts_[i]); } for (int i = 0; i < ends_.size(); ++i) { - auto* tensor = - scope_.NewTensor("ends_tensor_list_" + paddle::lite::to_string(i)); - tensor->Resize(DDim({1})); - auto* d = tensor->mutable_data(); - d[0] = ends_[i]; + SetCommonTensor("ends_tensor_list_" + paddle::lite::to_string(i), + DDim({1}), + &ends_[i]); } } } @@ -273,7 +268,7 @@ TEST(Slice, precision) { test_slice(place); test_slice_tensor(place); test_slice_tensor_list(place); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) Place place(TARGET(kXPU)); test_slice(place); #endif diff --git a/lite/tests/kernels/softmax_compute_test.cc b/lite/tests/kernels/softmax_compute_test.cc index a91f6534ffa1f8022e2005cc83255d306adf77c1..87a94aba184a055081446b4df830b72146834ed2 100644 --- a/lite/tests/kernels/softmax_compute_test.cc +++ b/lite/tests/kernels/softmax_compute_test.cc @@ -111,8 +111,12 @@ TEST(Softmax, precision) { for (auto x_dims : std::vector>{{1, 2, 3, 4}, {2, 3, 4}, {3, 4}}) { - for (auto axis : {-1, 0, 1, 2, 3}) { - if (axis >= x_dims.size()) continue; + int ndims = x_dims.size(); + for (int axis = -1; axis < ndims; axis++) { +#if defined(LITE_WITH_XPU) + if (axis != -1 && axis != ndims - 1) + continue; // -1 and dims.size() - 1 are only supported by XPU +#endif std::unique_ptr tester( new SoftmaxComputeTest(place, "def", DDim(x_dims), axis)); arena::Arena arena(std::move(tester), place, abs_error); diff --git a/lite/tests/kernels/stack_compute_test.cc b/lite/tests/kernels/stack_compute_test.cc index 10b289e41972eb6a9f332f0376393fdfaae94abe..72529cac5165badd50c086a75e882417725adb96 100644 --- a/lite/tests/kernels/stack_compute_test.cc +++ b/lite/tests/kernels/stack_compute_test.cc @@ -106,7 +106,7 @@ TEST(Stack, precision) { Place place; #ifdef LITE_WITH_ARM place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/kernels/transpose_compute_test.cc b/lite/tests/kernels/transpose_compute_test.cc index 0ec010e47fe22f0bd60f0c275696f726b6f01a68..933e9f8ec5fc7b1d9b510c71f57fda309a5477dc 100644 --- a/lite/tests/kernels/transpose_compute_test.cc +++ b/lite/tests/kernels/transpose_compute_test.cc @@ -164,7 +164,7 @@ TEST(Transpose, precision) { LOG(INFO) << "test 
Transpose op"; float abs_error = 2e-5; Place place; -#ifdef LITE_WITH_XPU +#if defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #elif defined(LITE_WITH_NPU) place = TARGET(kNPU); diff --git a/lite/tests/kernels/yolo_box_compute_test.cc b/lite/tests/kernels/yolo_box_compute_test.cc index c41c89608fd7496c5b01b1a813581f7f461ff0ee..b88f25e1e0ddb85683297c19a841a5d47b2bbccf 100644 --- a/lite/tests/kernels/yolo_box_compute_test.cc +++ b/lite/tests/kernels/yolo_box_compute_test.cc @@ -247,7 +247,7 @@ TEST(YoloBox, precision) { Place place; #if defined(LITE_WITH_ARM) place = TARGET(kARM); -#elif defined(LITE_WITH_XPU) +#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL) place = TARGET(kXPU); #else return; diff --git a/lite/tests/math/conv_compute_test.cc b/lite/tests/math/conv_compute_test.cc index 8265f9db2f85e54dd91314ac5dc7932e7f7e842a..9ad98ce6f4566898b3821e6bf540b331a84b97bb 100644 --- a/lite/tests/math/conv_compute_test.cc +++ b/lite/tests/math/conv_compute_test.cc @@ -236,19 +236,19 @@ void test_conv_fp32(const std::vector& input_dims, double gops = 2.0 * dim_out.production() * dim_in[1] * weight_dim[2] * weight_dim[3] / param.groups; - LOG(INFO) << "conv fp32: input shape: " << dim_in << ", output shape" - << dim_out << ",running time, avg: " << t0.LapTimes().Avg() - << ", min time: " << t0.LapTimes().Min() - << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); + VLOG(4) << "conv fp32: input shape: " << dim_in << ", output shape" + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); if (FLAGS_check_result) { double max_ratio = 0; double max_diff = 0; tensor_cmp_host(tout_basic, *param.output, max_ratio, max_diff); - LOG(INFO) << "compare result, max diff: " << max_diff - << ", max ratio: " << max_ratio; + VLOG(4) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; if (std::abs(max_ratio) > 1e-3f) { if (max_diff > 5e-4f) { LOG(WARNING) << "basic result"; @@ -274,15 +274,15 @@ void test_conv_fp32(const std::vector& input_dims, } } } - LOG(INFO) << "test fp32 conv: input: " << dim_in - << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] << ", " << pads[2] - << ", " << pads[3] << ", stride: " << strides[0] << ", " - << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] - << ", group: " << group - << ", bias: " << (flag_bias ? "true" : "false") - << ", act: " << flag_act << ", threads: " << th - << ", power_mode: " << cls << " successed!!\n"; + VLOG(4) << "test fp32 conv: input: " << dim_in + << ", output: " << dim_out << ", weight dim: " << weight_dim + << ", pad: " << pads[0] << ", " << pads[1] << ", " << pads[2] + << ", " << pads[3] << ", stride: " << strides[0] << ", " + << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] + << ", group: " << group + << ", bias: " << (flag_bias ? 
"true" : "false") + << ", act: " << flag_act << ", threads: " << th + << ", power_mode: " << cls << " successed!!\n"; } } } diff --git a/lite/tools/build_android.sh b/lite/tools/build_android.sh index aba5fb706cb62e5bc9b50127f16d07e0db55d595..5713c4e21bb97d12bb840c99d1adbc7f2d781157 100755 --- a/lite/tools/build_android.sh +++ b/lite/tools/build_android.sh @@ -1,5 +1,5 @@ #!/bin/bash -set -x +set +x ##################################################################################################### # 1. global variables, you can change them according to your requirements ##################################################################################################### @@ -269,6 +269,7 @@ function main { if [ -z "$1" ]; then # compiling result contains light_api lib only, recommanded. make_tiny_publish_so $ARCH $TOOLCHAIN $ANDROID_STL + exit 0 fi # Parse command line. @@ -358,6 +359,7 @@ function main { done # compiling result contains light_api lib only, recommanded. make_tiny_publish_so + exit 0 } main $@ diff --git a/lite/tools/build_bm.sh b/lite/tools/build_bm.sh index 964da15b0b6fcf888812271b0a2c944d9efa63b8..055f6a35c3ab145e9dfe4bc5d46172a2119ffb25 100755 --- a/lite/tools/build_bm.sh +++ b/lite/tools/build_bm.sh @@ -43,7 +43,7 @@ function prepare_thirdparty { # clone bmlibs if [ ! -d ${workspace}/third-party/bmlibs ]; then git clone https://github.com/AnBaolei1984/bmlibs.git ${workspace}/third-party/bmlibs - fi + fi } # for code gen, a source file is generated after a test, but is dependended by some targets in cmake. @@ -70,6 +70,13 @@ function build_bm { mkdir -p $build_dir cd $build_dir + if [ $TARGET_NAME == "BM1684" ]; then + BM_SDK_ROOT="$workspace/third-party/bmlibs/bm_sc5_libs" + else + BM_SDK_ROOT="$workspace/third-party/bmlibs/bm_sc3_libs" + fi + echo $BM_SDK_ROOT + prepare_workspace cmake .. \ ${CMAKE_COMMON_OPTIONS} \ @@ -95,17 +102,7 @@ function main { case $i in --target_name=*) TARGET_NAME="${i#*=}" - shift - ;; - #--bm_sdk_root=*) - # BM_SDK_ROOT="${i#*=}" - # shift - # ;; - bm) build_bm - shift - ;; - *) # unknown option print_usage exit 1 diff --git a/lite/tools/build_ios.sh b/lite/tools/build_ios.sh index 2c7eeb466f3d82cf491b6a631d79918fa4fd4cd2..3d4337aa8ecc20fd078b8906a950408927ea56c8 100755 --- a/lite/tools/build_ios.sh +++ b/lite/tools/build_ios.sh @@ -152,6 +152,7 @@ function main { esac done make_ios $ARCH + exit 0 } main $@ diff --git a/lite/tools/build_mlu.sh b/lite/tools/build_mlu.sh index 01d71aaf213abb99633112664af580b897ce7454..e0fb2ab11b110cf5a29151ea7c8e544a4074c8c5 100755 --- a/lite/tools/build_mlu.sh +++ b/lite/tools/build_mlu.sh @@ -4,7 +4,7 @@ set -ex # global variables with default value NEUWARE_HOME="${NEUWARE_HOME}" TARGET_NAME="all" # default target -BUILD_EXTRA=OFF # ON(with sequence ops)/OFF +BUILD_EXTRA=ON # ON(with sequence ops)/OFF WITH_TESTING=ON # ON/OFF function print_usage { @@ -28,16 +28,13 @@ readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/t readonly workspace=$(pwd) function prepare_thirdparty { - if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then + if [ ! -d $workspace/third-party ]; then rm -rf $workspace/third-party - - if [ ! -f $workspace/third-party-05b862.tar.gz ]; then - wget $THIRDPARTY_TAR - fi - tar xzf third-party-05b862.tar.gz - else - git submodule update --init --recursive fi + if [ ! 
-f $workspace/third-party-05b862.tar.gz ]; then + wget $THIRDPARTY_TAR + fi + tar xvf third-party-05b862.tar.gz } # for code gen, a source file is generated after a test, but is dependended by some targets in cmake. diff --git a/lite/tools/check_api_approvals.sh b/lite/tools/check_api_approvals.sh old mode 100644 new mode 100755 index 6100558d68abb2b4c82c1f367078e519972546ce..b2a4659c964121b0a95961195340c296710db2de --- a/lite/tools/check_api_approvals.sh +++ b/lite/tools/check_api_approvals.sh @@ -5,13 +5,14 @@ if [ -z ${BRANCH} ]; then fi LITE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../.." && pwd )" - approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle-Lite/pulls/${GIT_PR_ID}/reviews?per_page=10000` -git_files=`git diff --numstat upstream/$BRANCH| wc -l` -git_count=`git diff --numstat upstream/$BRANCH| awk '{sum+=$1}END{print sum}'` failed_num=0 echo_list=() +# approval list +Superjomn=328693 +DannyIsFunny=45189361 + function add_failed(){ failed_num=`expr $failed_num + 1` echo_list="${echo_list[@]}$1" @@ -24,20 +25,105 @@ function check_approval(){ add_failed "${failed_num}. ${echo_line}" fi } +#################################################################################################### +# Check 1: You must have Superjomn's (Yunchunwei) approval for changing +# 20+ files or adding more than 1000+ lines of content +#################################################################################################### +function CheckModifiedFileNums() { + git_files=`git diff --numstat upstream/$BRANCH| wc -l` + git_count=`git diff --numstat upstream/$BRANCH| awk '{sum+=$1}END{print sum}'` + + if [[ $git_files -gt 19 || $git_count -gt 999 ]];then + echo_line="You must have Superjomn's (Yunchunwei) approval for changing 20+ files or adding more than 1000+ lines of content.\n" + check_approval 1 $Superjomn + fi + if [ -n "${echo_list}" ];then + echo "****************" + echo -e "${echo_list[@]}" + echo "There are ${failed_num} approved errors." + echo "****************" + fi + + if [ -n "${echo_list}" ]; then + exit 1 + fi +} +#################################################################################################### +# Check 2: You must have Superjomn's (Yunchunwei) approval for increasing +# size of dynamic lib for 10+ kb +#################################################################################################### +function CheckLibSizeDiff() { + # step1: record lib size of current branch + lite/tools/build_android.sh --arch=armv8 --toolchain=gcc --android_stl=c++_static --with_log=OFF + current_size=`stat -c%s build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/cxx/lib/libpaddle_light_api_shared.so` -if [[ $git_files -gt 19 || $git_count -gt 999 ]];then - echo_line="You must have Superjomn (Yunchunwei) approval for change 20+ files or add than 1000+ lines of content.\n" - check_approval 1 328693 -fi + # step2: record lib size of current develop branch + git checkout develop + git clean -f . && git checkout . + git fetch upstream && git merge upstream/develop -if [ -n "${echo_list}" ];then - echo "****************" - echo -e "${echo_list[@]}" - echo "There are ${failed_num} approved errors." 
- echo "****************" -fi + lite/tools/build_android.sh --arch=armv8 --toolchain=gcc --android_stl=c++_static --with_log=OFF + develop_size=`stat -c%s build.lite.android.armv8.gcc/inference_lite_lib.android.armv8/cxx/lib/libpaddle_light_api_shared.so` + + # step3: if diff_size > 10485, special approval is needed + diff_size=$[$current_size - $develop_size] + if [ $diff_size -gt 10485 ]; then + echo_line="Your PR has increased basic inference lib for $diff_size Byte, exceeding maximum requirement of 10485 Byte (0.01M). You need Superjomn's (Yunchunwei) approval or you can contact DannyIsFunny(HuZhiqiang).\n" + echo "****************" + echo -e "${echo_line[@]}" + echo "There is an approved errors." + echo "****************" + exit 1 + fi +# Todo: Code below should be applied later. +# if [ $diff_size -gt 10485 ]; then +# echo_line="Your PR has increased basic inference lib for $diff_size Byte, exceeding maximum requirement of 10485 Byte (0.01M). You need Superjomn's (Yunchunwei) approval or you can contact DannyIsFunny(HuZhiqiang).\n" +# check_approval 1 $Superjomn +# fi +# +# if [ -n "${echo_list}" ];then +# echo "****************" +# echo -e "${echo_list[@]}" +# echo "There are ${failed_num} approved errors." +# echo "****************" +# fi +# +# if [ -n "${echo_list}" ]; then +# exit 1 +# fi +} + +#################################################################################################### +# Main functions +#################################################################################################### +function main { + if [ -z "$1" ]; then + # at least on argument is needed + echo "Error: at least on argument is needed!" + exit 1 + fi + + # Parse command line. + for i in "$@"; do + case $i in + check_modified_file_nums) + # modified files num can not exceed 20 + + CheckModifiedFileNums + exit 0 + ;; + check_lib_size_diff) + # size diff can not exceed 10K + + CheckLibSizeDiff + exit 0 + ;; + *) + # unknown option + echo "Error: unsupported input argument!" + exit 1 + ;; + esac + done +} -if [ -n "${echo_list}" ]; then - exit 1 -fi +main $@ diff --git a/lite/tools/ci_build.sh b/lite/tools/ci_build.sh index 29ed9100f932b3215e45fc2352b5f0d73b7349b1..680c865c2c8999a29ff2b351dadfc797506c87f6 100755 --- a/lite/tools/ci_build.sh +++ b/lite/tools/ci_build.sh @@ -279,7 +279,7 @@ function test_server { } function assert_api_spec_approvals() { - /bin/bash ${LITE_ROOT}/lite/tools/check_api_approvals.sh + /bin/bash ${LITE_ROOT}/lite/tools/check_api_approvals.sh check_modified_file_nums if [ "$?" != 0 ];then exit 1 fi @@ -353,7 +353,7 @@ function cmake_xpu { -DWITH_MKL=ON \ -DLITE_BUILD_EXTRA=ON \ -DLITE_WITH_XPU=ON \ - -DXPU_SDK_ROOT="$(pwd)/../../XPU_SDK" + -DXPU_SDK_ROOT="/opt/output" } function build_xpu { @@ -564,8 +564,18 @@ function test_arm_model { function test_model_optimize_tool_compile { cd $workspace cd build + # Compile opt tool cmake .. -DWITH_LITE=ON -DLITE_ON_MODEL_OPTIMIZE_TOOL=ON -DWITH_TESTING=OFF -DLITE_BUILD_EXTRA=ON make opt -j$NUM_CORES_FOR_COMPILE + # Check whether opt can transform quantized mobilenetv1 successfully. + cd lite/api && chmod +x ./opt + wget --no-check-certificate https://paddlelite-data.bj.bcebos.com/doc_models/MobileNetV1_quant.tar.gz + tar zxf MobileNetV1_quant.tar.gz + ./opt --model_dir=./MobileNetV1_quant --valid_targets=arm --optimize_out=quant_mobilenetv1 + if [ ! -f quant_mobilenetv1.nb ]; then + echo -e "Error! Resulted opt can not tramsform MobileNetV1_quant successfully!" 
+ exit 1 + fi } function _test_paddle_code_generator { diff --git a/lite/utils/all.h b/lite/utils/all.h index a0d323aa24b36dac7858f484eb1cf1d5a7bcba50..8586188b99971d04271d14ac2d3b8043b0ea4414 100644 --- a/lite/utils/all.h +++ b/lite/utils/all.h @@ -14,10 +14,16 @@ #pragma once +#include +#include +#include +#include +#include +#include + #include "lite/utils/any.h" #include "lite/utils/check.h" #include "lite/utils/cp_logging.h" -#include "lite/utils/factory.h" #include "lite/utils/hash.h" #include "lite/utils/io.h" #include "lite/utils/macros.h" diff --git a/lite/utils/env.h b/lite/utils/env.h index 3048c84b42f6f658eaf0c8ee0d08456f53162c37..f3bb8b58e1b63ed2c0ed05792020d11ea307690c 100644 --- a/lite/utils/env.h +++ b/lite/utils/env.h @@ -22,6 +22,8 @@ #define SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE \ "SUBGRAPH_CUSTOM_PARTITION_CONFIG_FILE" +#define SUBGRAPH_DISABLE_ONLINE_MODE "SUBGRAPH_DISABLE_ONLINE_MODE" + namespace paddle { namespace lite { diff --git a/lite/utils/factory.h b/lite/utils/factory.h deleted file mode 100644 index d286ceb42ce32dba68bc68cabab2a600ad3d7789..0000000000000000000000000000000000000000 --- a/lite/utils/factory.h +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include "lite/utils/all.h" -#include "lite/utils/cp_logging.h" -#include "lite/utils/replace_stl/stream.h" - -namespace paddle { -namespace lite { - -/* - * Factor for any Type creator. - * - * Usage: - * - * struct SomeType; - * // Register a creator. - * Factory::Global().Register("some_key", [] -> - * std::unique_ptr { ... }); - * // Retrive a creator. - * auto some_type_instance = Factory::Global().Create("some_key"); - */ -template -class Factory { - public: - using item_t = ItemType; - using self_t = Factory; - using item_ptr_t = ItemTypePtr; - using creator_t = std::function; - - static Factory& Global() { - static Factory* x = new self_t; - return *x; - } - - void Register(const std::string& op_type, creator_t&& creator) { - creators_[op_type].emplace_back(std::move(creator)); - } - - item_ptr_t Create(const std::string& op_type) const { - auto res = Creates(op_type); - if (res.empty()) return nullptr; - CHECK_EQ(res.size(), 1UL) << "Get multiple Op for type " << op_type; - return std::move(res.front()); - } - - std::list Creates(const std::string& op_type) const { - std::list res; - auto it = creators_.find(op_type); - if (it == creators_.end()) return res; - for (auto& c : it->second) { - res.emplace_back(c()); - } - return res; - } - - std::string DebugString() const { - STL::stringstream ss; - for (const auto& item : creators_) { - ss << " - " << item.first << "\n"; - } - return ss.str(); - } - - protected: - std::map> creators_; -}; - -/* A helper function to help run a lambda at the start. 
- */ -template -class Registor { - public: - explicit Registor(std::function&& functor) { functor(); } - - // Touch will do nothing. - int Touch() { return 0; } -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/utils/io.h b/lite/utils/io.h index 2141364df79bb189772592a556dd9a115ae1a67e..5de95e72f06856df01189e8ae3f1c22115801094 100644 --- a/lite/utils/io.h +++ b/lite/utils/io.h @@ -120,5 +120,40 @@ static std::vector<std::string> ListDir(const std::string& path, return paths; } +static bool ReadFile(const std::string& filename, std::vector<char>* contents) { + FILE* fp = fopen(filename.c_str(), "rb"); + if (!fp) return false; + fseek(fp, 0, SEEK_END); + size_t size = ftell(fp); + fseek(fp, 0, SEEK_SET); + contents->clear(); + contents->resize(size); + size_t offset = 0; + char* ptr = reinterpret_cast<char*>(&(contents->at(0))); + while (offset < size) { + size_t already_read = fread(ptr, 1, size - offset, fp); + offset += already_read; + ptr += already_read; + } + fclose(fp); + return true; +} + +static bool WriteFile(const std::string& filename, + const std::vector<char>& contents) { + FILE* fp = fopen(filename.c_str(), "wb"); + if (!fp) return false; + size_t size = contents.size(); + size_t offset = 0; + const char* ptr = reinterpret_cast<const char*>(&(contents.at(0))); + while (offset < size) { + size_t already_written = fwrite(ptr, 1, size - offset, fp); + offset += already_written; + ptr += already_written; + } + fclose(fp); + return true; +} + } // namespace lite } // namespace paddle diff --git a/lite/utils/md5.h b/lite/utils/md5.h new file mode 100644 index 0000000000000000000000000000000000000000..c2e972dd8001a9a85e29688f460be061d64a16b5 --- /dev/null +++ b/lite/utils/md5.h @@ -0,0 +1,104 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include <string> + +namespace paddle { +namespace lite { + +std::string MD5(std::string message) { + const uint32_t shiftAmounts[] = { + 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, + 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, + 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, + 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21}; + const uint32_t partsOfSines[] = { + 0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a, + 0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, + 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340, + 0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8, + 0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8, + 0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, + 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa, + 0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665, + 0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92, + 0xffeff47d, 0x85845dd1, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, + 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391}; + + uint32_t state[4]; + state[0] = 0x67452301; + state[1] = 0xefcdab89; + state[2] = 0x98badcfe; + state[3] = 0x10325476; + + // Pad with zeros + int size = ((((message.length() + 8) / 64) + 1) * 64) - 8; + uint8_t *buf = reinterpret_cast<uint8_t *>(calloc(size + 64, 1)); + memcpy(buf, message.c_str(), message.length()); + buf[message.length()] = 128; + uint32_t bits = 8 * message.length(); + memcpy(buf + size, &bits, 4); + +// Process at each 512-bit(64 bytes) chunk +#define LEFTROTATE(x, c) (((x) << (c)) | ((x) >> (32 - (c)))) + for (int offset = 0; offset < size; offset += 64) { + uint32_t A = state[0]; + uint32_t B = state[1]; + uint32_t C = state[2]; + uint32_t D = state[3]; + uint32_t *W = reinterpret_cast<uint32_t *>(buf + offset); + for (uint32_t i = 0; i < 64; i++) { + uint32_t F, g; + if (i < 16) { + F = (B & C) | ((~B) & D); + g = i; + } else if (i < 32) { + F = (D & B) | ((~D) & C); + g = (5 * i + 1) % 16; + } else if (i < 48) { + F = B ^ C ^ D; + g = (3 * i + 5) % 16; + } else { + F = C ^ (B | (~D)); + g = (7 * i) % 16; + } + uint32_t T = D; + D = C; + C = B; + B = B + LEFTROTATE((A + F + partsOfSines[i] + W[g]), shiftAmounts[i]); + A = T; + } + state[0] += A; + state[1] += B; + state[2] += C; + state[3] += D; + } +#undef LEFTROTATE + free(buf); + + // Convert digest to string + std::string res; + res.reserve(16 << 1); + const uint8_t *digest = reinterpret_cast<const uint8_t *>(state); + char hex[3]; + for (size_t i = 0; i < 16; i++) { + snprintf(hex, sizeof(hex), "%02x", digest[i]); + res.append(hex); + } + return res; +} + +} // namespace lite +} // namespace paddle diff --git a/lite/utils/paddle_enforce.h b/lite/utils/paddle_enforce.h deleted file mode 100644 index 82534af996919ac69a8624e442f1af6a9abb2c07..0000000000000000000000000000000000000000 --- a/lite/utils/paddle_enforce.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -/* - * This file defines PADDLE_ENFORCE_xx, which helps to adapt the legacy fluid - * codes. - */ -#pragma once -#include "lite/utils/cp_logging.h" -#include "lite/utils/string.h" - -#define PADDLE_ENFORCE(cond, ...) \ - CHECK((cond)) << paddle::lite::string_format("" __VA_ARGS__); -#define PADDLE_ENFORCE_EQ(a, b, ...) \ - CHECK_EQ((a), (b)) << paddle::lite::string_format("" __VA_ARGS__); -#define PADDLE_ENFORCE_LE(a, b, ...) \ - CHECK_LE((a), (b)) << paddle::lite::string_format("" __VA_ARGS__); -#define PADDLE_ENFORCE_LT(a, b, ...) \ - CHECK_LT((a), (b)) << paddle::lite::string_format("" __VA_ARGS__); - -#define PADDLE_ENFORCE_GE(a, b, ...) \ - CHECK_GE((a), (b)) << paddle::lite::string_format("" __VA_ARGS__); -#define PADDLE_ENFORCE_GT(a, b, ...) \ - CHECK_GT((a), (b)) << paddle::lite::string_format("" __VA_ARGS__); - -#ifndef PADDLE_THROW -#define PADDLE_THROW(...) printf("" __VA_ARGS__); -#endif diff --git a/lite/utils/string.h b/lite/utils/string.h index ada51d0b85d7536bfc937a7b1b8368a0f0e053be..b1aaf5d6c56d8931c4ad416f9d38c947abc68dd8 100644 --- a/lite/utils/string.h +++ b/lite/utils/string.h @@ -60,6 +60,38 @@ static std::string to_string(const T& v) { return ss.str(); } +static std::string to_string(int index) { + const int BUFFER_LENGTH = 15; + char buffer[BUFFER_LENGTH]; + snprintf(buffer, sizeof(buffer), "%d", index); + return std::string(buffer); +} + +template <class T> +static T parse_string(const std::string& v) { + return v; +} + +template <> +int32_t parse_string<int32_t>(const std::string& v) { + return std::stoi(v); +} + +template <> +int64_t parse_string<int64_t>(const std::string& v) { + return std::stoll(v); +} + +template <> +float parse_string<float>(const std::string& v) { + return std::stof(v); +} + +template <> +double parse_string<double>(const std::string& v) { + return std::stod(v); +} + template <typename T> std::string Join(const std::vector<T>& vec, const std::string& delim) { if (vec.empty()) return ""; @@ -84,19 +116,20 @@ static std::string Repr(const std::vector<std::string>& v) { return "{" + Join(tmp, ",") + "}"; } -static std::vector Split(const std::string& original, - const std::string& separator) { - std::vector results; +template <class T> +static std::vector<T> Split(const std::string& original, + const std::string& separator) { + std::vector<T> results; std::string::size_type pos1, pos2; pos2 = original.find(separator); pos1 = 0; while (std::string::npos != pos2) { - results.push_back(original.substr(pos1, pos2 - pos1)); + results.push_back(parse_string<T>(original.substr(pos1, pos2 - pos1))); pos1 = pos2 + separator.size(); pos2 = original.find(separator, pos1); } if (pos1 != original.length()) { - results.push_back(original.substr(pos1)); + results.push_back(parse_string<T>(original.substr(pos1))); } return results; } diff --git a/third-party/flatbuffers b/third-party/flatbuffers new file mode 160000 index 0000000000000000000000000000000000000000..6df40a2471737b27271bdd9b900ab5f3aec746c7 --- /dev/null +++ b/third-party/flatbuffers @@ -0,0 +1 @@ +Subproject commit 6df40a2471737b27271bdd9b900ab5f3aec746c7
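
Usage note on the new lite/utils/io.h helpers: ReadFile and WriteFile move a whole file through a byte buffer, looping until the full size has been transferred. A minimal round-trip sketch follows; it assumes the buffers are std::vector<char> (as restored above) and uses a hypothetical temporary path, so treat it as illustrative rather than part of the patch.

#include <string>
#include <vector>
#include "lite/utils/io.h"

int main() {
  // Write four bytes, then read them back with the new helpers.
  const std::string path = "/tmp/lite_io_demo.bin";  // hypothetical path
  std::vector<char> src = {'l', 'i', 't', 'e'};
  if (!paddle::lite::WriteFile(path, src)) return 1;

  std::vector<char> dst;
  if (!paddle::lite::ReadFile(path, &dst)) return 1;

  // dst should now hold exactly the bytes written above.
  return dst == src ? 0 : 1;
}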
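
Usage note on lite/utils/md5.h: MD5() is a self-contained implementation that returns the digest of the input string as a 32-character lowercase hex string. A quick sanity-check sketch against the well-known RFC 1321 test vectors (the expected strings below are the standard MD5 values, not something introduced by this patch):

#include <cassert>
#include <string>
#include "lite/utils/md5.h"

int main() {
  // Standard MD5 test vectors, lowercase hex as produced by MD5().
  assert(paddle::lite::MD5("") == "d41d8cd98f00b204e9800998ecf8426e");
  assert(paddle::lite::MD5("abc") == "900150983cd24fb0d6963f7d28e17f72");
  return 0;
}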
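
Usage note on the templated Split in lite/utils/string.h: Split is now parameterized on the element type and converts every token with parse_string<T>, so callers can split a delimited string directly into numeric vectors. A small sketch, assuming the template parameters restored above:

#include <cstdint>
#include <string>
#include <vector>
#include "lite/utils/string.h"

int main() {
  // Previous behaviour: split into strings (now spelled with an explicit type).
  std::vector<std::string> ops =
      paddle::lite::Split<std::string>("conv2d,relu,pool2d", ",");

  // New behaviour: split and parse into numbers in one step.
  std::vector<int64_t> shape = paddle::lite::Split<int64_t>("1,3,224,224", ",");
  std::vector<float> scales = paddle::lite::Split<float>("0.5,0.25", ",");

  return (ops.size() == 3 && shape.size() == 4 && scales.size() == 2) ? 0 : 1;
}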