diff --git a/README.md b/README.md
index 23974beee9a8af5ee7e2c454575efff2e3d96ee2..22b84888294b5ef60c3d91d7a7909aef8f601d81 100644
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@ Framework compatibility: In addition to models trained on PaddlePaddle, those tr
 
 Paddle Lite is designed to support a wide range of hardware and devices, and it enables mixed execution of a single model on multiple devices, optimizations at various phases, and lightweight applications on devices.
 
-![img](https://github.com/Superjomn/_tmp_images/raw/master/images/paddle-lite-architecture.png)
+![img](https://user-images.githubusercontent.com/45189361/70908123-6ce4fd00-2045-11ea-97e1-ad08446c5c86.png)
 
 As shown in the figure above, the analysis phase includes the Machine IR module, and it enables optimizations like op fusion and redundant computation pruning. Besides, the execution phase only involves kernel execution, so it can be deployed on its own to ensure a maximally lightweight deployment.
diff --git a/README_cn.md b/README_cn.md
index 99d38c47ffbbaa3b8593801701e3528167899f97..11d3967fe8ce88826ca982b71d96268c1a7e5c3a 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -34,7 +34,7 @@ Paddle Lite为Paddle-Mobile的升级版,定位支持包括手机移动端在
 
 PaddleLite 的架构设计着重考虑了对多硬件和平台的支持,并且强化了多个硬件在一个模型中混合执行的能力,多个层面的性能优化处理,以及对端侧应用的轻量化设计。
 
-![](https://github.com/Superjomn/_tmp_images/raw/master/images/paddle-lite-architecture.png)
+![](https://user-images.githubusercontent.com/45189361/70908123-6ce4fd00-2045-11ea-97e1-ad08446c5c86.png)
 
 其中,Analysis Phase 包括了 MIR(Machine IR) 相关模块,能够对原有的模型的计算图针对具体的硬件列表进行算子融合、计算裁剪 在内的多种优化。Execution Phase 只涉及到Kernel 的执行,且可以单独部署,以支持极致的轻量级部署。
diff --git a/cmake/cross_compiling/postproject.cmake b/cmake/cross_compiling/postproject.cmake
index 88ac3e101a686cb49ef5a4c3b1879c15b8f7b57b..7466b3e6d438277ad31020f76665bf689df436f5 100644
--- a/cmake/cross_compiling/postproject.cmake
+++ b/cmake/cross_compiling/postproject.cmake
@@ -63,7 +63,7 @@ if (LITE_ON_TINY_PUBLISH)
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
     endif()
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flto -fvisibility=hidden -fvisibility-inlines-hidden -fdata-sections -ffunction-sections")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections")
     check_linker_flag(-Wl,--gc-sections)
 endif()
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
index 574baa86a82963ffa76795e029a6ba14f537c80a..e5f1ec4cf21806992b22558f102c806e90e8858e 100644
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -32,10 +32,9 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
     $ENV{CUDNN_ROOT}/lib64
     $ENV{CUDNN_ROOT}/lib
     /usr/lib
-    ${CUDA_TOOLKIT_ROOT_DIR}
-    ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
-    ${CUDA_TOOLKIT_ROOT_DIR}/lib64
-    )
+    ${CUDA_TOOLKIT_ROOT_DIR}
+    ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
+    ${CUDA_TOOLKIT_ROOT_DIR}/lib64)
 
 if((${CUDA_VERSION} GREATER 10.0) OR (${CUDA_VERSION} EQUAL 10.0))
   find_library(CUBLAS_LIBRARY NAMES libcublas.so PATHS ${CUDNN_CHECK_LIBRARY_DIRS} NO_DEFAULT_PATH)
diff --git a/cmake/lite.cmake b/cmake/lite.cmake
index a095eea6d1cce9ba09ee631a50b8029e769f6d37..d6b374529e27119f1c48c03c667aa694481e45e8 100644
--- a/cmake/lite.cmake
+++ b/cmake/lite.cmake
@@ -118,7 +118,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
 function(lite_cc_library TARGET)
     set(options SHARED shared STATIC static MODULE module)
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS
NPU_DEPS XPU_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS LIGHT_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -128,10 +128,10 @@ function(lite_cc_library TARGET) X86_DEPS ${args_X86_DEPS} CUDA_DEPS ${args_CUDA_DEPS} CL_DEPS ${args_CL_DEPS} - NPU_DEPS ${args_NPU_DEPS} - XPU_DEPS ${args_XPU_DEPS} ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} + NPU_DEPS ${args_NPU_DEPS} + XPU_DEPS ${args_XPU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -161,7 +161,7 @@ function(lite_cc_binary TARGET) set(options " -g ") endif() set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -173,6 +173,8 @@ function(lite_cc_binary TARGET) CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} + NPU_DEPS ${args_NPU_DEPS} + XPU_DEPS ${args_XPU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -205,7 +207,7 @@ function(lite_cc_test TARGET) endif() set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS COMPILE_LEVEL # (basic|extra) @@ -225,6 +227,8 @@ function(lite_cc_test TARGET) CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} + NPU_DEPS ${args_NPU_DEPS} + XPU_DEPS ${args_XPU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -267,7 +271,7 @@ endif() function(add_kernel TARGET device level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -360,11 +364,12 @@ function(add_kernel TARGET device level) lite_cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${args_DEPS} X86_DEPS ${args_X86_DEPS} - XPU_DEPS ${args_XPU_DEPS} CUDA_DEPS ${args_CUDA_DEPS} CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} + NPU_DEPS ${args_NPU_DEPS} + XPU_DEPS ${args_XPU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -383,7 +388,7 @@ endif() function(add_operator TARGET level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -409,11 +414,12 @@ function(add_operator TARGET level) lite_cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${args_DEPS} X86_DEPS ${args_X86_DEPS} - XPU_DEPS ${args_XPU_DEPS} CUDA_DEPS ${args_CUDA_DEPS} 
CL_DEPS ${args_CL_DEPS} ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} + NPU_DEPS ${args_NPU_DEPS} + XPU_DEPS ${args_XPU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} diff --git a/cmake/xpu.cmake b/cmake/xpu.cmake index 8d99343c3041351102820cb20890031fa3f5807e..ab34f409b8fa08af4eb01ff1289107a599d8c27d 100644 --- a/cmake/xpu.cmake +++ b/cmake/xpu.cmake @@ -89,7 +89,7 @@ else() endif() find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8 - PATHS ${XPU_SDK_ROOT}/XTDK/shlib) + PATHS ${XPU_SDK_ROOT}/XTDK/shlib/gcc482) if(NOT XPU_SDK_LLVM_FILE) message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}") diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index 21e53bde34af66cadeea84b831fda3eccf77c643..df6b7d3648409e13d88c049ec86173905f8b3cb6 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -172,13 +172,17 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/libpaddle_light_api_shared.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" ) add_dependencies(tiny_publish_cxx_lib paddle_light_api_shared) + add_dependencies(tiny_publish_cxx_lib bundle_light_api) add_dependencies(publish_inference tiny_publish_cxx_lib) - add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD - COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so) + if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD + COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so) + endif() endif() endif() endif() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 408a63e3f5bd911ec93575d7cd6b2e2ef3b2b2d8..70239e94e7a3064fb383246623d05a2079dda1fa 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -16,8 +16,11 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGE add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto) target_link_libraries(paddle_full_api_shared framework_proto) if(LITE_WITH_X86) - add_dependencies(paddle_full_api_shared xxhash) - target_link_libraries(paddle_full_api_shared xxhash) + add_dependencies(paddle_full_api_shared xxhash) + target_link_libraries(paddle_full_api_shared xxhash) + if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL) + add_dependencies(paddle_full_api_shared dynload_mklml) + endif() endif() if(LITE_WITH_CUDA) target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive") @@ -38,10 +41,11 @@ else() if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux")) add_library(paddle_light_api_shared SHARED "") target_sources(paddle_light_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc light_api_impl.cc) - add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) + set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") + add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) if (LITE_WITH_NPU) # Need to add HIAI runtime libs (libhiai.so) dependency - 
target_link_libraries(paddle_light_api_shared ${npu_runtime_libs})
+            target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs})
         endif()
     endif()
 endif()
@@ -77,8 +81,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
             DEPS ${cxx_api_deps} ${ops} ${host_kernels} program
             X86_DEPS ${x86_kernels}
             ARM_DEPS ${arm_kernels}
-            NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass
-            XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass
+            NPU_DEPS ${npu_kernels}
+            XPU_DEPS ${xpu_kernels}
             CL_DEPS ${opencl_kernels}
             FPGA_DEPS ${fpga_kernels})
 endif()
diff --git a/lite/api/_paddle_use_ops.h b/lite/api/_paddle_use_ops.h
index bdccfab5df67e485b9fef110dc6cc1e9d74b21c3..6da47e53789d651f4a36d0b8d6a7ca1ea5a0a3d3 100644
--- a/lite/api/_paddle_use_ops.h
+++ b/lite/api/_paddle_use_ops.h
@@ -108,7 +108,7 @@ USE_LITE_OP(while)
 USE_LITE_OP(lod_reset)
 USE_LITE_OP(lookup_table)
 USE_LITE_OP(multiclass_nms)
-USE_LITE_OP(graph_op)
+USE_LITE_OP(subgraph)
 USE_LITE_OP(sequence_expand)
 USE_LITE_OP(sequence_pool)
 USE_LITE_OP(reduce_max)
diff --git a/lite/api/android/jni/native/CMakeLists.txt b/lite/api/android/jni/native/CMakeLists.txt
index 3efa980332f25d786d5c880fab9b3ba5af0a1013..c1766772f8aaa417c3da1d72f2692c10c10194b4 100644
--- a/lite/api/android/jni/native/CMakeLists.txt
+++ b/lite/api/android/jni/native/CMakeLists.txt
@@ -25,11 +25,12 @@ if (NOT LITE_ON_TINY_PUBLISH)
     endif()
 else()
     add_library(paddle_lite_jni SHARED "")
+    set_target_properties(paddle_lite_jni PROPERTIES COMPILE_FLAGS "-flto -fdata-sections")
     target_sources(paddle_lite_jni PUBLIC ${__lite_cc_files} paddle_lite_jni.cc tensor_jni.cc)
     add_dependencies(paddle_lite_jni op_list_h kernel_list_h)
     if (LITE_WITH_NPU)
         # Need to add HIAI runtime libs (libhiai.so) dependency
-        target_link_libraries(paddle_lite_jni ${npu_runtime_libs})
+        target_link_libraries(paddle_lite_jni ${npu_builder_libs} ${npu_runtime_libs})
     endif()
 endif()
diff --git a/lite/api/android/jni/native/tensor_jni.cc b/lite/api/android/jni/native/tensor_jni.cc
index 59cafa19399c4d265915e2dac8653e9ed7d10851..5212fe9a6eba2b034883da93c9ea5d845a63c773 100644
--- a/lite/api/android/jni/native/tensor_jni.cc
+++ b/lite/api/android/jni/native/tensor_jni.cc
@@ -120,6 +120,22 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3B(
   return JNI_TRUE;
 }
 
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3I(
+    JNIEnv *env, jobject jtensor, jintArray buf) {
+  std::unique_ptr<Tensor> *tensor = get_writable_tensor_pointer(env, jtensor);
+  if (tensor == nullptr || (*tensor == nullptr)) {
+    return JNI_FALSE;
+  }
+  int64_t buf_size = (int64_t)env->GetArrayLength(buf);
+  if (buf_size != product((*tensor)->shape())) {
+    return JNI_FALSE;
+  }
+
+  int32_t *input = (*tensor)->mutable_data<int32_t>();
+  env->GetIntArrayRegion(buf, 0, buf_size, input);
+  return JNI_TRUE;
+}
+
 JNIEXPORT jfloatArray JNICALL
 Java_com_baidu_paddle_lite_Tensor_getFloatData(JNIEnv *env, jobject jtensor) {
   if (is_const_tensor(env, jtensor)) {
@@ -148,6 +164,20 @@ Java_com_baidu_paddle_lite_Tensor_getByteData(JNIEnv *env, jobject jtensor) {
   }
 }
 
+JNIEXPORT jintArray JNICALL
+Java_com_baidu_paddle_lite_Tensor_getIntData(JNIEnv *env, jobject jtensor) {
+  if (is_const_tensor(env, jtensor)) {
+    std::unique_ptr<const Tensor> *tensor =
+        get_read_only_tensor_pointer(env, jtensor);
+    return cpp_array_to_jintarray(
+        env, (*tensor)->data<int32_t>(), product((*tensor)->shape()));
+  } else {
+    std::unique_ptr<Tensor> *tensor = get_writable_tensor_pointer(env, jtensor);
+    return cpp_array_to_jintarray(
+        env, (*tensor)->data<int32_t>(), product((*tensor)->shape()));
+  }
+}
+
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_deleteCppTensor(
     JNIEnv *env, jobject jtensor, jlong java_pointer) {
   if (java_pointer == 0) {
diff --git a/lite/api/android/jni/native/tensor_jni.h b/lite/api/android/jni/native/tensor_jni.h
index 34c35b6a76f777895dbe88dc5eadf48c659ee544..9b029dfb4c7431354d5de20c6132236764c6cc66 100644
--- a/lite/api/android/jni/native/tensor_jni.h
+++ b/lite/api/android/jni/native/tensor_jni.h
@@ -16,8 +16,8 @@
 #include <jni.h>
 /* Header for class com_baidu_paddle_lite_Tensor */
 
-#ifndef PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_
-#define PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_
+#ifndef LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_
+#define LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -49,6 +49,14 @@ Java_com_baidu_paddle_lite_Tensor_getFloatData(JNIEnv *, jobject);
 JNIEXPORT jbyteArray JNICALL
 Java_com_baidu_paddle_lite_Tensor_getByteData(JNIEnv *, jobject);
 
+/*
+ * Class:     com_baidu_paddle_lite_Tensor
+ * Method:    getIntData
+ * Signature: ()[I
+ */
+JNIEXPORT jintArray JNICALL
+Java_com_baidu_paddle_lite_Tensor_getIntData(JNIEnv *, jobject);
+
 /*
  * Class:     com_baidu_paddle_lite_Tensor
  * Method:    nativeResize
@@ -73,6 +81,14 @@ JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3F(
 JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3B(
     JNIEnv *, jobject, jbyteArray);
 
+/*
+ * Class:     com_baidu_paddle_lite_Tensor
+ * Method:    nativeSetData
+ * Signature: ([I)Z
+ */
+JNIEXPORT jboolean JNICALL Java_com_baidu_paddle_lite_Tensor_nativeSetData___3I(
+    JNIEnv *, jobject, jintArray);
+
 /*
  * Class:     com_baidu_paddle_lite_Tensor
  * Method:    deleteCppTensor
@@ -87,4 +103,4 @@ Java_com_baidu_paddle_lite_Tensor_deleteCppTensor(JNIEnv *, jobject, jlong);
 #ifdef __cplusplus
 }
 #endif
-#endif  // PADDLE_FLUID_LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_
+#endif  // LITE_API_ANDROID_JNI_NATIVE_TENSOR_JNI_H_
diff --git a/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java b/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java
index ac78800bd2e4903b44332a0a0aefe9c69b75abab..f76841dd413ddda86678eecf8241068dd98b74a4 100644
--- a/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java
+++ b/lite/api/android/jni/src/com/baidu/paddle/lite/Tensor.java
@@ -108,6 +108,19 @@ public class Tensor {
         return nativeSetData(buf);
     }
 
+    /**
+     * Set the tensor int data.
+     *
+     * @param buf the int array buffer which will be copied into the tensor.
+     * @return true if the data is set successfully.
+     */
+    public boolean setData(int[] buf) {
+        if (readOnly) {
+            return false;
+        }
+        return nativeSetData(buf);
+    }
+
     /**
      * @return shape of the tensor as long array.
      */
@@ -123,12 +136,19 @@
      */
     public native byte[] getByteData();
 
+    /**
+     * @return the tensor data as int array.
+     */
+    public native int[] getIntData();
+
     private native boolean nativeResize(long[] dims);
 
     private native boolean nativeSetData(float[] buf);
 
     private native boolean nativeSetData(byte[] buf);
 
+    private native boolean nativeSetData(int[] buf);
+
    /**
     * Delete the C++ Tensor object pointed to by the input pointer, which is represented by a
    * long value.
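The JNI and Java changes above expose int32 tensor I/O to Java (`setData(int[])` and `getIntData()`) alongside the existing float and byte paths; `nativeSetData___3I` copies the `jintArray` into `mutable_data<int32_t>()` via `GetIntArrayRegion`. For comparison, here is a minimal sketch of the same int32 flow through the C++ light API. The model path and shapes are illustrative, and it assumes a model whose first input and output are int32 tensors:

```cpp
#include <vector>

#include "paddle_api.h"          // NOLINT
#include "paddle_use_kernels.h"  // NOLINT
#include "paddle_use_ops.h"      // NOLINT

int main() {
  // Load an optimized model with the light-weight API.
  paddle::lite_api::MobileConfig config;
  config.set_model_dir("./mobilenet_v1_opt");  // illustrative path

  auto predictor = paddle::lite_api::CreatePaddlePredictor<
      paddle::lite_api::MobileConfig>(config);

  // Fill an int32 input tensor; this is what Tensor.setData(int[]) does
  // through nativeSetData___3I on the Java side.
  auto input = predictor->GetInput(0);
  input->Resize({1, 64});  // illustrative shape
  auto* in_data = input->mutable_data<int32_t>();
  for (int i = 0; i < 64; ++i) {
    in_data[i] = i;
  }

  predictor->Run();

  // Read an int32 output tensor; the Java counterpart is Tensor.getIntData().
  auto output = predictor->GetOutput(0);
  const auto* out_data = output->data<int32_t>();
  (void)out_data;
  return 0;
}
```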
diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc
index 4647f20bbe476d8763f94f707f3d88da7c7544df..990d08f18f541088d797510e9dbd4881d42b164f 100644
--- a/lite/api/cxx_api.cc
+++ b/lite/api/cxx_api.cc
@@ -139,22 +139,15 @@ std::vector<std::string> Predictor::GetOutputNames() { return output_names_; }
 
 // append the names of inputs and outputs into input_names_ and output_names_
 void Predictor::PrepareFeedFetch() {
-  std::vector<const cpp::OpDesc *> feeds;
-  std::vector<const cpp::OpDesc *> fetchs;
-#if defined(LITE_WITH_NPU) || defined(LITE_WITH_XPU)
-  // The shape of input tensors must be determined before generating NPU and XPU
-  // program.
-  auto current_block = program_desc_.GetBlock<cpp::BlockDesc>(0);
-  for (size_t i = 0; i < current_block->OpsSize(); i++) {
-    auto op = current_block->GetOp<cpp::OpDesc>(i);
-#else
   if (!program_) {
     GenRuntimeProgram();
   }
+
+  std::vector<const cpp::OpDesc *> feeds;
+  std::vector<const cpp::OpDesc *> fetchs;
   const auto &insts = program_->instructions();
   for (size_t i = 0; i < program_->num_instructions(); i++) {
     const auto &op = insts[i].op()->op_info();
-#endif
     if (op->Type() == "feed") {
       feeds.push_back(op);
     } else if (op->Type() == "fetch") {
diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc
index 6fa400db6da9f029c38b496cd70d593a876628c9..3e6e10103e9f3af51923459a5921f9781431f352 100644
--- a/lite/api/cxx_api_impl.cc
+++ b/lite/api/cxx_api_impl.cc
@@ -20,6 +20,12 @@
 #include "lite/core/device_info.h"
 #include "lite/core/version.h"
 
+#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
+    !(defined LITE_ON_MODEL_OPTIMIZE_TOOL)
+#include <omp.h>
+#include "lite/backends/x86/mklml.h"
+#endif
+
 namespace paddle {
 namespace lite {
 
@@ -33,6 +39,17 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
   mode_ = config.power_mode();
   threads_ = config.threads();
+
+#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
+    !(defined LITE_ON_MODEL_OPTIMIZE_TOOL)
+  int num_threads = config.cpu_math_library_num_threads();
+  int real_num_threads = num_threads > 1 ? num_threads : 1;
+  paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads);
+  omp_set_num_threads(real_num_threads);
+  VLOG(3) << "set_cpu_math_library_math_threads() is set successfully and the "
+             "number of threads is:"
+          << num_threads;
+#endif
 }
 
 std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetInput(int i) {
diff --git a/lite/api/model_optimize_tool.cc b/lite/api/model_optimize_tool.cc
index 1c426e8568cf71b6f48edbbeb8a93fec2e89c594..b678c7ecd24c5ffbf3e9e3531264ac195c6a7325 100644
--- a/lite/api/model_optimize_tool.cc
+++ b/lite/api/model_optimize_tool.cc
@@ -90,6 +90,10 @@ std::vector<Place> ParserValidPlaces() {
         TARGET(kARM));  // enable kARM CPU kernel when no opencl kernel
   } else if (target_repr == "x86") {
     valid_places.emplace_back(TARGET(kX86));
+  } else if (target_repr == "npu") {
+    valid_places.emplace_back(TARGET(kNPU));
+  } else if (target_repr == "xpu") {
+    valid_places.emplace_back(TARGET(kXPU));
   } else {
     LOG(FATAL) << lite::string_format(
         "Wrong target '%s' found, please check the command flag "
diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc
index a04e86b7d2a1e06a52c38b5f00e9c07966be1bfe..cf5fa4981a173ceb77e091ea9be0e510eb53980a 100644
--- a/lite/api/model_test.cc
+++ b/lite/api/model_test.cc
@@ -72,10 +72,6 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
          const int thread_num,
          const int repeat,
          const int warmup_times = 0) {
-#ifdef LITE_WITH_PROFILE
-  lite::profile::BasicProfiler<lite::profile::BasicTimer>::Global().SetWarmup(
-      warmup_times);
-#endif
   lite_api::MobileConfig config;
   config.set_model_dir(model_dir);
   config.set_power_mode(power_mode);
diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h
index c578769bd5159d27ad43e4e93de33f601223004b..339117cd503247a91694d1a9ca63b930af5658de 100644
--- a/lite/api/paddle_api.h
+++ b/lite/api/paddle_api.h
@@ -133,6 +133,7 @@ class LITE_API CxxConfig : public ConfigBase {
   std::string model_file_;
   std::string param_file_;
   bool model_from_memory_{false};
+  int cpu_math_library_math_threads_ = 1;
 
  public:
  void set_valid_places(const std::vector<Place>& x) { valid_places_ = x; }
@@ -151,6 +152,13 @@ class LITE_API CxxConfig : public ConfigBase {
   std::string model_file() const { return model_file_; }
   std::string param_file() const { return param_file_; }
   bool model_from_memory() const { return model_from_memory_; }
+
+  void set_cpu_math_library_math_threads(int threads) {
+    cpu_math_library_math_threads_ = threads;
+  }
+  int cpu_math_library_num_threads() const {
+    return cpu_math_library_math_threads_;
+  }
 };
 
 /// MobileConfig is the config for the light weight predictor, it will skip
diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc
index 894d839185ea9e1b6b47b87c398f249f044c2b51..6d12df67ac70d5d922680fc76763123117045175 100644
--- a/lite/api/paddle_place.cc
+++ b/lite/api/paddle_place.cc
@@ -77,7 +77,8 @@ const std::string& PrecisionToStr(PrecisionType precision) {
 }
 
 const std::string& DataLayoutToStr(DataLayoutType layout) {
-  static const std::string datalayout2string[] = {"unk", "NCHW", "any", "NHWC"};
+  static const std::string datalayout2string[] = {
+      "unk", "NCHW", "any", "NHWC", "ImageDefault", "ImageFolder", "ImageNW"};
   auto x = static_cast<int>(layout);
   CHECK_LT(x, static_cast<int>(DATALAYOUT(NUM)));
   return datalayout2string[x];
@@ -115,8 +116,13 @@ const std::string& PrecisionRepr(PrecisionType precision) {
 }
 
 const std::string& DataLayoutRepr(DataLayoutType layout) {
-  static const std::string datalayout2string[] = {
-      "kUnk", "kNCHW", "kAny", "kNHWC"};
+  static const std::string datalayout2string[] = {"kUnk",
+                                                  "kNCHW",
+                                                  "kAny",
+                                                  "kNHWC",
+                                                  "kImageDefault",
+                                                  "kImageFolder",
+                                                  "kImageNW"};
   auto x = static_cast<int>(layout);
   CHECK_LT(x, static_cast<int>(DATALAYOUT(NUM)));
   return datalayout2string[x];
@@ -146,8 +152,12 @@ std::set<PrecisionType> ExpandValidPrecisions(PrecisionType precision) {
 }
 
 std::set<DataLayoutType> ExpandValidLayouts(DataLayoutType layout) {
-  static const std::set<DataLayoutType> valid_set(
-      {DATALAYOUT(kNCHW), DATALAYOUT(kAny), DATALAYOUT(kNHWC)});
+  static const std::set<DataLayoutType> valid_set({DATALAYOUT(kNCHW),
+                                                   DATALAYOUT(kAny),
+                                                   DATALAYOUT(kNHWC),
+                                                   DATALAYOUT(kImageDefault),
+                                                   DATALAYOUT(kImageFolder),
+                                                   DATALAYOUT(kImageNW)});
   if (layout == DATALAYOUT(kAny)) {
     return valid_set;
   }
diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h
index 07284be095c05e5dfa069b0973d5982cf1f07c8a..1aa41522352e9c2832e3c9919249887480e871a3 100644
--- a/lite/api/paddle_place.h
+++ b/lite/api/paddle_place.h
@@ -71,8 +71,11 @@ enum class DataLayoutType : int {
   kUnk = 0,
   kNCHW = 1,
   kNHWC = 3,
-  kAny = 2,  // any data layout
-  NUM = 4,   // number of fields.
+  kImageDefault = 4,  // for opencl image2d
+  kImageFolder = 5,   // for opencl image2d
+  kImageNW = 6,       // for opencl image2d
+  kAny = 2,           // any data layout
+  NUM = 7,            // number of fields.
 };
 
 typedef enum {
diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h
index 9d56d262abf549584819ab893144e41fc399439f..ac29cdda019c29ee208df391e0c637dc07329abe 100644
--- a/lite/api/paddle_use_passes.h
+++ b/lite/api/paddle_use_passes.h
@@ -20,12 +20,6 @@ USE_MIR_PASS(static_kernel_pick_pass);
 USE_MIR_PASS(variable_place_inference_pass);
 USE_MIR_PASS(type_target_cast_pass);
 USE_MIR_PASS(generate_program_pass);
-#ifdef LITE_WITH_NPU
-USE_MIR_PASS(generate_npu_program_pass);
-#endif
-#ifdef LITE_WITH_XPU
-USE_MIR_PASS(generate_xpu_program_pass);
-#endif
 
 USE_MIR_PASS(io_copy_kernel_pick_pass);
 USE_MIR_PASS(argument_type_display_pass);
@@ -40,8 +34,12 @@ USE_MIR_PASS(lite_interpolate_fuse_pass);
 USE_MIR_PASS(identity_scale_eliminate_pass);
 USE_MIR_PASS(lite_conv_elementwise_fuse_pass);
 USE_MIR_PASS(lite_conv_activation_fuse_pass);
+USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass);
 USE_MIR_PASS(lite_elementwise_add_activation_fuse_pass);
 USE_MIR_PASS(lite_quant_dequant_fuse_pass);
 USE_MIR_PASS(type_precision_cast_pass);
 USE_MIR_PASS(type_layout_cast_pass);
 USE_MIR_PASS(memory_optimize_pass);
+USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
+USE_MIR_PASS(npu_subgraph_pass);
+USE_MIR_PASS(xpu_subgraph_pass);
diff --git a/lite/api/python/pybind/CMakeLists.txt b/lite/api/python/pybind/CMakeLists.txt
index 178f167e6a1627d01df13b2e105e0af36b20601a..eabb6b150b93a722282118c3932676cd1aee5da8 100644
--- a/lite/api/python/pybind/CMakeLists.txt
+++ b/lite/api/python/pybind/CMakeLists.txt
@@ -4,3 +4,6 @@ if (NOT LITE_ON_TINY_PUBLISH)
 endif()
 
 lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
+if (LITE_ON_TINY_PUBLISH)
+  set_target_properties(lite_pybind PROPERTIES COMPILE_FLAGS "-flto -fdata-sections")
+endif()
diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc
index 2df2e8f8f8aa56bb71b0e1cb293df2ecbbafd0bb..7d4ed4e98701a5328b0f05387dc73ad8b93dfe18 100644
--- a/lite/api/python/pybind/pybind.cc
+++ b/lite/api/python/pybind/pybind.cc
@@ -165,6 +165,9 @@ void BindLitePlace(py::module *m) {
   py::enum_<DataLayoutType>(*m, "DataLayoutType")
       .value("NCHW", DataLayoutType::kNCHW)
       .value("NHWC", DataLayoutType::kNHWC)
+      .value("ImageDefault", DataLayoutType::kImageDefault)
+      .value("ImageFolder", DataLayoutType::kImageFolder)
+      .value("ImageNW", DataLayoutType::kImageNW)
       .value("Any", DataLayoutType::kAny);
 
   //
Place diff --git a/lite/api/test_step_rnn_lite_x86.cc b/lite/api/test_step_rnn_lite_x86.cc index 5314c5ed75d862635a1b87cdad33bf3c58dcd6cc..4d0aefbc06a9d0678d8b401629b7cc4355967f6c 100644 --- a/lite/api/test_step_rnn_lite_x86.cc +++ b/lite/api/test_step_rnn_lite_x86.cc @@ -30,6 +30,7 @@ TEST(Step_rnn, test_step_rnn_lite_x86) { std::string model_dir = FLAGS_model_dir; lite_api::CxxConfig config; config.set_model_dir(model_dir); + config.set_cpu_math_library_math_threads(10); config.set_valid_places({lite_api::Place{TARGET(kX86), PRECISION(kInt64)}, lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); diff --git a/lite/backends/arm/math/CMakeLists.txt b/lite/backends/arm/math/CMakeLists.txt index 076c791daab182c4eff477a621ecd2ec52a0c3e7..3bf1a00dd2701a2aaf79183eb6eb476e5cf67fff 100644 --- a/lite/backends/arm/math/CMakeLists.txt +++ b/lite/backends/arm/math/CMakeLists.txt @@ -120,5 +120,7 @@ if (NOT HAS_ARM_MATH_LIB_DIR) stack.cc affine_channel.cc anchor_generator.cc + split_merge_lod_tenosr.cc + reduce_prod.cc DEPS ${lite_kernel_deps} context tensor) endif() diff --git a/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc index 5834461b8fe0b2d37f174d5f66269fb58f2504a1..67d60b18141f64fd4e0048e1a5d1e2c5373c7484 100644 --- a/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc +++ b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc @@ -24,29 +24,48 @@ namespace paddle { namespace lite { namespace arm { namespace math { -void input_trans_c4(const float* src, - int src_stride, - float* dest, - int dest_stride); -void output_trans_c4(const float* src, - int src_stride, - float* dest, - int dest_stride); -void output_trans_c4_post(const float* src, - int src_stride, - float* dest, - int dest_stride, - float* bias_value, - bool has_relu); -void weight_trans_c4( +void input_trans_c4_8x8(const float* src, + int src_stride, + float* dest, + int dest_stride); +void output_trans_c4_6x8(const float* src, + int src_stride, + float* dest, + int dest_stride); +void output_trans_c4_post_6x8(const float* src, + int src_stride, + float* dest, + int dest_stride, + float* bias_value, + bool has_relu); +void input_trans_c4_4x4(const float* src, + int src_stride, + int src_h_stride, + float* dest, + int dest_stride, + int dest_h_stride); +void output_trans_c4_post_2x4(const float* src, + int src_stride, + int src_h_stride, + float* dest, + int dest_stride, + int dest_h_stride, + float* bias_value, + bool has_relu); +void weight_trans_c4_8x8( + float* dest, const float* src, int ic, int oc, void* workspace); +void weight_trans_c4_4x4( float* dest, const float* src, int ic, int oc, void* workspace); /* -*The following function conv_compute_6x6_3x3 is base on +*The following function conv_compute_6x6_3x3 and conv_compute_2x2_3x3[_small] is +*base on *MNN[https://github.com/alibaba/MNN] * *Copyright © 2018, Alibaba Group Holding Limited */ + +// F(6,3) void conv_compute_6x6_3x3(const float* input, float* output, int num, @@ -75,11 +94,14 @@ void conv_compute_6x6_3x3(const float* input, int tile_w = (wout + 5) / 6; int tile_h = (hout + 5) / 6; int size_tile = tile_h * tile_w; - float zero_ptr[8]; - memset(zero_ptr, 0, 8 * sizeof(float)); int w_pad = win + pad_w * 2; int h_pad = hin + pad_h * 2; + + const int zero_len = w_pad; + float zero_ptr[zero_len]; // NOLINT + memset(zero_ptr, 0, zero_len * sizeof(float)); + float* input_c4 = tmp_work_space; int new_h_stride = w_pad * 4; int new_c_stride = new_h_stride * h_pad; @@ 
-88,9 +110,6 @@ void conv_compute_6x6_3x3(const float* input, int oc_4_stride = wout * hout * 4; int tile_block = 8; -#ifdef __aarch64__ - tile_block = 16; -#endif int block_count = (size_tile + tile_block - 1) / tile_block; int threads = ctx->threads(); @@ -102,7 +121,8 @@ void conv_compute_6x6_3x3(const float* input, // begin compute for (int ni = 0; ni < num; ++ni) { - // trans input to c4 +// trans input to c4 +#pragma omp parallel for num_threads(threads) for (int i = 0; i < ic_4; ++i) { prepack_input_nxwc4_dw(input + ni * in_n_stride, input_c4 + i * new_c_stride, @@ -161,14 +181,14 @@ void conv_compute_6x6_3x3(const float* input, const float* src_ci = src_ptr + ci * ic_4_stride; for (int i = 0; i < 8; ++i) { const float* ci_ptr = src_ci + i * w_pad * 4; - input_trans_c4(ci_ptr, 4, trans_tmp_data + i * 4, 32); + input_trans_c4_8x8(ci_ptr, 4, trans_tmp_data + i * 4, 32); } float* dst_ci = dst_ptr + ci * tile_count * 4; for (int i = 0; i < 8; ++i) { - input_trans_c4(trans_tmp_data + i * 32, - 4, - dst_ci + i * b_gi_stride * 8, - b_gi_stride); + input_trans_c4_8x8(trans_tmp_data + i * 32, + 4, + dst_ci + i * b_gi_stride * 8, + b_gi_stride); } } } else { @@ -189,14 +209,14 @@ void conv_compute_6x6_3x3(const float* input, // trans for (int i = 0; i < 8; ++i) { float* ci_ptr = trans_remain_tmp_data + i * 32; - input_trans_c4(ci_ptr, 4, trans_tmp_data + i * 4, 32); + input_trans_c4_8x8(ci_ptr, 4, trans_tmp_data + i * 4, 32); } float* dst_ci = dst_ptr + ci * tile_count * 4; for (int i = 0; i < 8; ++i) { - input_trans_c4(trans_tmp_data + i * 32, - 4, - dst_ci + i * b_gi_stride * 8, - b_gi_stride); + input_trans_c4_8x8(trans_tmp_data + i * 32, + 4, + dst_ci + i * b_gi_stride * 8, + b_gi_stride); } } // for ci_4 } @@ -213,16 +233,8 @@ void conv_compute_6x6_3x3(const float* input, float* origin_C = dst_temp_data + gi * c_gi_stride; float* origin_B = b_ptr + gi * b_gi_stride; const float* origin_A = weight + gi * w_gi_stride; - sgemm_prepack_c4_small(oc_4 * 4, - tile_count, - ic_4 * 4, - origin_A, - origin_B, - origin_C, - nullptr, - false, - false, - ctx); + sgemm_prepack_c4_small( + oc_4 * 4, tile_count, ic_4 * 4, origin_A, origin_B, origin_C, ctx); } //*/ //* @@ -258,18 +270,18 @@ void conv_compute_6x6_3x3(const float* input, float* dst_ci = dst_ptr + ci * oc_4_stride; float* src_ci = src_ptr + ci * tile_count * 4; for (int i = 0; i < 8; ++i) { - output_trans_c4(src_ci + i * c_gi_stride * 8, - c_gi_stride, - trans_tmp_data + i * 4, - 32); + output_trans_c4_6x8(src_ci + i * c_gi_stride * 8, + c_gi_stride, + trans_tmp_data + i * 4, + 32); } for (int i = 0; i < ey; ++i) { - output_trans_c4_post(trans_tmp_data + i * 32, - 4, - trans_remain_tmp_data + i * 24, - 4, - bias_value, - param.fuse_relu); + output_trans_c4_post_6x8(trans_tmp_data + i * 32, + 4, + trans_remain_tmp_data + i * 24, + 4, + bias_value, + param.fuse_relu); } write_to_output_c4_fp32(trans_remain_tmp_data, output_ptr, @@ -283,7 +295,8 @@ void conv_compute_6x6_3x3(const float* input, hout, wout, false, - zero_ptr); + zero_ptr, + nullptr); } } else { for (int ci = 0; ci < oc_4; ++ci) { @@ -297,18 +310,18 @@ void conv_compute_6x6_3x3(const float* input, float* dst_ci = dst_ptr + ci * oc_4_stride; float* src_ci = src_ptr + ci * tile_count * 4; for (int i = 0; i < 8; ++i) { - output_trans_c4(src_ci + i * c_gi_stride * 8, - c_gi_stride, - trans_tmp_data + i * 4, - 32); + output_trans_c4_6x8(src_ci + i * c_gi_stride * 8, + c_gi_stride, + trans_tmp_data + i * 4, + 32); } for (int i = 0; i < ey; ++i) { - output_trans_c4_post(trans_tmp_data + 
i * 32, - 4, - trans_remain_tmp_data + i * 24, - 4, - bias_value, - param.fuse_relu); + output_trans_c4_post_6x8(trans_tmp_data + i * 32, + 4, + trans_remain_tmp_data + i * 24, + 4, + bias_value, + param.fuse_relu); } // copy to dest memset(trans_tmp_data, 0, 144 * sizeof(float)); @@ -329,7 +342,8 @@ void conv_compute_6x6_3x3(const float* input, hout, wout, false, - zero_ptr); + zero_ptr, + nullptr); } } } @@ -338,10 +352,526 @@ void conv_compute_6x6_3x3(const float* input, } // for num } // conv_compute -void output_trans_c4(const float* src, - int src_stride, - float* dest, - int dest_stride) { +// F(2,3) +void conv_compute_2x2_3x3(const float* input, + float* output, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const float* weight, + const float* bias, + const operators::ConvParam& param, + ARMContext* ctx) { + const int pad_h = (*param.paddings)[0]; + const int pad_w = (*param.paddings)[2]; + float* tmp_work_space = + ctx->workspace_data() + ctx->llc_size() / sizeof(float); + + int in_n_stride = chin * hin * win; + int out_n_stride = chout * hout * wout; + int ic_stride = win * hin; + int oc_stride = wout * hout; + int ic_4 = (chin + 3) / 4; + int oc_4 = (chout + 3) / 4; + + int tile_w = (wout + 1) / 2; + int tile_h = (hout + 1) / 2; + int size_tile = tile_h * tile_w; + + int w_pad = win + pad_w * 2; + int h_pad = hin + pad_h * 2; + + const int zero_len = w_pad; + float zero_ptr[zero_len]; // NOLINT + memset(zero_ptr, 0, zero_len * sizeof(float)); + + float* input_c4 = tmp_work_space; + int new_h_stride = w_pad * 4; + int new_c_stride = new_h_stride * h_pad; + + int ic_4_stride = w_pad * h_pad * 4; + int oc_4_stride = wout * hout * 4; + + int tile_block = 8; + int block_count = (size_tile + tile_block - 1) / tile_block; + + int threads = ctx->threads(); + float* g_tmp_data = tmp_work_space + ic_4 * new_c_stride; + int tmp_data_thread_stride = tile_block * (oc_4 + ic_4) * 64; + memset(g_tmp_data, 0, threads * tmp_data_thread_stride * sizeof(float)); + float* g_trans_tmp_data = g_tmp_data + threads * tmp_data_thread_stride; + float* g_trans_remain_tmp_data = g_trans_tmp_data + threads * 64; + + // begin compute + for (int ni = 0; ni < num; ++ni) { +// trans input to c4 +#pragma omp parallel for num_threads(threads) + for (int i = 0; i < ic_4; ++i) { + prepack_input_nxwc4_dw(input + ni * in_n_stride, + input_c4 + i * new_c_stride, + i * 4, + -pad_h, + hin + pad_h, + -pad_w, + win + pad_w, + chin, + win, + hin, + zero_ptr); + } + float* output_ptr = output + ni * out_n_stride; + + const float* weight_ptr = weight; + const float* bias_ptr = bias; +#pragma omp parallel for num_threads(threads) + for (int tbi = 0; tbi < block_count; ++tbi) { +#ifdef ARM_WITH_OMP + float* tmp_data = + g_tmp_data + omp_get_thread_num() * tmp_data_thread_stride; + float* trans_tmp_data = g_trans_tmp_data + omp_get_thread_num() * 64; + float* trans_remain_tmp_data = + g_trans_remain_tmp_data + omp_get_thread_num() * 64; +#else + float* tmp_data = g_tmp_data; + float* trans_tmp_data = g_trans_tmp_data; + float* trans_remain_tmp_data = g_trans_remain_tmp_data; +#endif + int tile_index = tbi * tile_block; + int tile_remain = size_tile - tile_index; + int tile_count = tile_remain > tile_block ? 
tile_block : tile_remain; + + // input trans + int c_gi_stride = tile_count * oc_4 * 4; + int b_gi_stride = tile_count * ic_4 * 4; + //* + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int src_x = tw_index + tw_index; + int src_y = th_index + th_index; + int ex = src_x + 4 > w_pad ? w_pad - src_x : 4; + int ey = src_y + 4 > h_pad ? h_pad - src_y : 4; + + float* dst_ptr = tmp_data + ti * 4; + const float* src_ptr = input_c4 + (src_y * w_pad + src_x) * 4; + + if (ex == 4 && ey == 4) { + // trans input + for (int ci = 0; ci < ic_4; ++ci) { + const float* src_ci = src_ptr + ci * ic_4_stride; + float* dst_ci = dst_ptr + ci * tile_count * 4; + input_trans_c4_4x4( + src_ci, 4, w_pad * 4, dst_ci, b_gi_stride, b_gi_stride * 4); + } + } else { + // trans remain input + int x_size = ex; + for (int ci = 0; ci < ic_4; ++ci) { + const float* src_ci = src_ptr + ci * ic_4_stride; + // pad + memset(trans_remain_tmp_data, 0, 64 * sizeof(float)); + if (x_size > 0) { + for (int yi = 0; yi < ey; ++yi) { + float* dst_yi = trans_remain_tmp_data + yi * 16; + const float* src_yi = src_ci + w_pad * yi * 4; + memcpy(dst_yi, src_yi, x_size * sizeof(float) * 4); + } + } + + // trans + float* dst_ci = dst_ptr + ci * tile_count * 4; + input_trans_c4_4x4(trans_remain_tmp_data, + 4, + 16, + dst_ci, + b_gi_stride, + b_gi_stride * 4); + } // for ci_4 + } + } + //*/ + // input trans end + // *begin compute dot + // * + //* + float* dst_temp_data = tmp_data + tile_block * ic_4 * 64; + float* b_ptr = tmp_data; + int w_gi_stride = ic_4 * oc_4 * 16; + for (int gi = 0; gi < 16; ++gi) { + float* origin_C = dst_temp_data + gi * c_gi_stride; + float* origin_B = b_ptr + gi * b_gi_stride; + const float* origin_A = weight + gi * w_gi_stride; + sgemm_prepack_c4_small( + oc_4 * 4, tile_count, ic_4 * 4, origin_A, origin_B, origin_C, ctx); + } + //*/ + //* + // output trans + float bias_value[4]; + memset(bias_value, 0, 4 * sizeof(float)); + + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int dst_x = tw_index * 2; + int dst_y = th_index * 2; + + int ex = dst_x + 2 > wout ? wout - dst_x : 2; + int ey = dst_y + 2 > hout ? 
hout - dst_y : 2; + + float* dst_ptr = output + (dst_y * wout + dst_x) * 4; + float* src_ptr = dst_temp_data + ti * 4; + + if (ex == 2) { + // trans output + for (int ci = 0; ci < oc_4; ++ci) { + if (param.bias) { + bias_value[0] = bias[ci * 4]; + bias_value[1] = bias[ci * 4 + 1]; + bias_value[2] = bias[ci * 4 + 2]; + bias_value[3] = bias[ci * 4 + 3]; + } + + float* dst_ci = dst_ptr + ci * oc_4_stride; + float* src_ci = src_ptr + ci * tile_count * 4; + output_trans_c4_post_2x4(src_ci, + c_gi_stride, + c_gi_stride * 4, + trans_remain_tmp_data, + 4, + 8, + bias_value, + param.fuse_relu); + write_to_output_c4_fp32(trans_remain_tmp_data, + output_ptr, + ci * 4, + ci * 4 + 4, + dst_y, + dst_y + ey, + dst_x, + dst_x + ex, + chout, + hout, + wout, + false, + zero_ptr, + nullptr); + } + } else { + for (int ci = 0; ci < oc_4; ++ci) { + if (param.bias) { + bias_value[0] = bias[ci * 4]; + bias_value[1] = bias[ci * 4 + 1]; + bias_value[2] = bias[ci * 4 + 2]; + bias_value[3] = bias[ci * 4 + 3]; + } + // trans output + float* dst_ci = dst_ptr + ci * oc_4_stride; + float* src_ci = src_ptr + ci * tile_count * 4; + output_trans_c4_post_2x4(src_ci, + c_gi_stride, + c_gi_stride * 4, + trans_remain_tmp_data, + 4, + 8, + bias_value, + param.fuse_relu); + // copy to dest + memset(trans_tmp_data, 0, 16 * sizeof(float)); + for (int i = 0; i < ey; ++i) { + memcpy(trans_tmp_data + i * ex * 4, + trans_remain_tmp_data + i * 8, + ex * sizeof(float) * 4); + } + write_to_output_c4_fp32(trans_tmp_data, + output_ptr, + ci * 4, + ci * 4 + 4, + dst_y, + dst_y + ey, + dst_x, + dst_x + ex, + chout, + hout, + wout, + false, + zero_ptr, + nullptr); + } + } + } + //*/ + } // for block_count + } // for num +} // conv_compute +void conv_compute_2x2_3x3_small(const float* input, + float* output, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const float* weight, + const float* bias, + const operators::ConvParam& param, + ARMContext* ctx) { + const int pad_h = (*param.paddings)[0]; + const int pad_w = (*param.paddings)[2]; + float* tmp_work_space = + ctx->workspace_data() + ctx->llc_size() / sizeof(float); + + int in_n_stride = chin * hin * win; + int out_n_stride = chout * hout * wout; + int ic_stride = win * hin; + int oc_stride = wout * hout; + int ic_4 = (chin + 3) / 4; + int oc_4 = (chout + 3) / 4; + + int tile_w = (wout + 1) / 2; + int tile_h = (hout + 1) / 2; + int size_tile = tile_h * tile_w; + + int w_pad = win + pad_w * 2; + int h_pad = hin + pad_h * 2; + + const int zero_len = w_pad; + float zero_ptr[zero_len]; // NOLINT + memset(zero_ptr, 0, zero_len * sizeof(float)); + + float* input_c4 = tmp_work_space; + int new_h_stride = w_pad * 4; + int new_c_stride = new_h_stride * h_pad; + + int ic_4_stride = w_pad * h_pad * 4; + int oc_4_stride = wout * hout * 4; + + int tile_block = 8; + int block_count = (size_tile + tile_block - 1) / tile_block; + + int threads = ctx->threads(); + float* g_tmp_data = tmp_work_space + ic_4 * new_c_stride; + int tmp_data_thread_stride = tile_block * (oc_4 + ic_4) * 64; + memset(g_tmp_data, 0, tmp_data_thread_stride * sizeof(float)); + float* g_trans_tmp_data = g_tmp_data + tmp_data_thread_stride; + float* g_trans_remain_tmp_data = g_trans_tmp_data + 64; + + // begin compute + for (int ni = 0; ni < num; ++ni) { +// trans input to c4 + +#pragma omp parallel for num_threads(threads) + for (int i = 0; i < ic_4; ++i) { + prepack_input_nxwc4_dw(input + ni * in_n_stride, + input_c4 + i * new_c_stride, + i * 4, + -pad_h, + hin + pad_h, + -pad_w, + win + pad_w, + 
chin, + win, + hin, + zero_ptr); + } + float* output_ptr = output + ni * out_n_stride; + + const float* weight_ptr = weight; + const float* bias_ptr = bias; + for (int tbi = 0; tbi < block_count; ++tbi) { + float* tmp_data = g_tmp_data; + float* trans_tmp_data = g_trans_tmp_data; + float* trans_remain_tmp_data = g_trans_remain_tmp_data; + int tile_index = tbi * tile_block; + int tile_remain = size_tile - tile_index; + int tile_count = tile_remain > tile_block ? tile_block : tile_remain; + + // input trans + int c_gi_stride = tile_count * oc_4 * 4; + int b_gi_stride = tile_count * ic_4 * 4; + //* + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int src_x = tw_index + tw_index; + int src_y = th_index + th_index; + int ex = src_x + 4 > w_pad ? w_pad - src_x : 4; + int ey = src_y + 4 > h_pad ? h_pad - src_y : 4; + + float* dst_ptr = tmp_data + ti * 4; + const float* src_ptr = input_c4 + (src_y * w_pad + src_x) * 4; + + if (ex == 4 && ey == 4) { + // trans input + for (int ci = 0; ci < ic_4; ++ci) { + const float* src_ci = src_ptr + ci * ic_4_stride; + float* dst_ci = dst_ptr + ci * tile_count * 4; + input_trans_c4_4x4( + src_ci, 4, w_pad * 4, dst_ci, b_gi_stride, b_gi_stride * 4); + } + } else { + // trans remain input + int x_size = ex; + for (int ci = 0; ci < ic_4; ++ci) { + const float* src_ci = src_ptr + ci * ic_4_stride; + // pad + memset(trans_remain_tmp_data, 0, 64 * sizeof(float)); + if (x_size > 0) { + for (int yi = 0; yi < ey; ++yi) { + float* dst_yi = trans_remain_tmp_data + yi * 16; + const float* src_yi = src_ci + w_pad * yi * 4; + memcpy(dst_yi, src_yi, x_size * sizeof(float) * 4); + } + } + + float* dst_ci = dst_ptr + ci * tile_count * 4; + input_trans_c4_4x4(trans_remain_tmp_data, + 4, + 16, + dst_ci, + b_gi_stride, + b_gi_stride * 4); + } // for ci_4 + } + } + //*/ + // input trans end + // *begin compute dot + // * + //* + float* dst_temp_data = tmp_data + tile_block * ic_4 * 64; + float* b_ptr = tmp_data; + int w_gi_stride = ic_4 * oc_4 * 16; +#pragma omp parallel for num_threads(threads) + for (int gi = 0; gi < 16; ++gi) { + float* origin_C = dst_temp_data + gi * c_gi_stride; + float* origin_B = b_ptr + gi * b_gi_stride; + const float* origin_A = weight + gi * w_gi_stride; + sgemm_prepack_c4_small( + oc_4 * 4, tile_count, ic_4 * 4, origin_A, origin_B, origin_C, ctx); + } + //*/ + //* + // output trans + float bias_value[4]; + memset(bias_value, 0, 4 * sizeof(float)); + + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int dst_x = tw_index * 2; + int dst_y = th_index * 2; + + int ex = dst_x + 2 > wout ? wout - dst_x : 2; + int ey = dst_y + 2 > hout ? 
hout - dst_y : 2; + + float* dst_ptr = output + (dst_y * wout + dst_x) * 4; + float* src_ptr = dst_temp_data + ti * 4; + + if (ex == 2) { + // trans output + for (int ci = 0; ci < oc_4; ++ci) { + if (param.bias) { + bias_value[0] = bias[ci * 4]; + bias_value[1] = bias[ci * 4 + 1]; + bias_value[2] = bias[ci * 4 + 2]; + bias_value[3] = bias[ci * 4 + 3]; + } + + float* dst_ci = dst_ptr + ci * oc_4_stride; + float* src_ci = src_ptr + ci * tile_count * 4; + + output_trans_c4_post_2x4(src_ci, + c_gi_stride, + c_gi_stride * 4, + trans_remain_tmp_data, + 4, + 8, + bias_value, + param.fuse_relu); + write_to_output_c4_fp32(trans_remain_tmp_data, + output_ptr, + ci * 4, + ci * 4 + 4, + dst_y, + dst_y + ey, + dst_x, + dst_x + ex, + chout, + hout, + wout, + false, + zero_ptr, + nullptr); + } + } else { + for (int ci = 0; ci < oc_4; ++ci) { + if (param.bias) { + bias_value[0] = bias[ci * 4]; + bias_value[1] = bias[ci * 4 + 1]; + bias_value[2] = bias[ci * 4 + 2]; + bias_value[3] = bias[ci * 4 + 3]; + } + // trans output + float* dst_ci = dst_ptr + ci * oc_4_stride; + float* src_ci = src_ptr + ci * tile_count * 4; + output_trans_c4_post_2x4(src_ci, + c_gi_stride, + c_gi_stride * 4, + trans_remain_tmp_data, + 4, + 8, + bias_value, + param.fuse_relu); + // copy to dest + memset(trans_tmp_data, 0, 16 * sizeof(float)); + for (int i = 0; i < ey; ++i) { + memcpy(trans_tmp_data + i * ex * 4, + trans_remain_tmp_data + i * 8, + ex * sizeof(float) * 4); + } + write_to_output_c4_fp32(trans_tmp_data, + output_ptr, + ci * 4, + ci * 4 + 4, + dst_y, + dst_y + ey, + dst_x, + dst_x + ex, + chout, + hout, + wout, + false, + zero_ptr, + nullptr); + } + } + } + //*/ + } // for block_count + } // for num +} // conv_compute +void output_trans_c4_6x8(const float* src, + int src_stride, + float* dest, + int dest_stride) { const float32x4_t src0 = vld1q_f32(src); const float32x4_t src1 = vld1q_f32(src + src_stride); const float32x4_t src2 = vld1q_f32(src + src_stride * 2); @@ -381,12 +911,13 @@ void output_trans_c4(const float* src, vst1q_f32(dest + dest_stride * 4, dest4); vst1q_f32(dest + dest_stride * 5, dest5); } -void output_trans_c4_post(const float* src, - int src_stride, - float* dest, - int dest_stride, - float* bias_value, - bool has_relu = false) { + +void output_trans_c4_post_6x8(const float* src, + int src_stride, + float* dest, + int dest_stride, + float* bias_value, + bool has_relu = false) { const float32x4_t src0 = vld1q_f32(src); const float32x4_t src1 = vld1q_f32(src + src_stride); const float32x4_t src2 = vld1q_f32(src + src_stride * 2); @@ -447,10 +978,10 @@ void output_trans_c4_post(const float* src, vst1q_f32(dest + dest_stride * 5, dest5); } -void input_trans_c4(const float* src, - int src_stride, - float* dest, - int dest_stride) { +void input_trans_c4_8x8(const float* src, + int src_stride, + float* dest, + int dest_stride) { float32x4_t src0 = vld1q_f32(src); float32x4_t src1 = vld1q_f32(src + src_stride); float32x4_t src2 = vld1q_f32(src + src_stride * 2); @@ -497,7 +1028,165 @@ void input_trans_c4(const float* src, vst1q_f32(dest + dest_stride * 6, dst6); vst1q_f32(dest + dest_stride * 7, dst7); } -void weight_trans_c4( + +// BT=[1, 0, -1, 0, +// 0, 1, 1, 0, +// 0, -1, 1, 0, +// 0, 1, 0, -1] +void input_trans_c4_4x4(const float* src, + int src_stride, + int src_h_stride, + float* dest, + int dest_stride, + int dest_h_stride) { + float32x4_t src00 = vld1q_f32(src); + float32x4_t src01 = vld1q_f32(src + src_stride); + float32x4_t src02 = vld1q_f32(src + src_stride + src_stride); + float32x4_t src03 = 
vld1q_f32(src + src_stride + src_stride + src_stride); + src += src_h_stride; + float32x4_t src10 = vld1q_f32(src); + float32x4_t src11 = vld1q_f32(src + src_stride); + float32x4_t src12 = vld1q_f32(src + src_stride + src_stride); + float32x4_t src13 = vld1q_f32(src + src_stride + src_stride + src_stride); + src += src_h_stride; + float32x4_t src20 = vld1q_f32(src); + float32x4_t src21 = vld1q_f32(src + src_stride); + float32x4_t src22 = vld1q_f32(src + src_stride + src_stride); + float32x4_t src23 = vld1q_f32(src + src_stride + src_stride + src_stride); + src += src_h_stride; + float32x4_t src30 = vld1q_f32(src); + float32x4_t src31 = vld1q_f32(src + src_stride); + float32x4_t src32 = vld1q_f32(src + src_stride + src_stride); + float32x4_t src33 = vld1q_f32(src + src_stride + src_stride + src_stride); + + float32x4_t dst00 = vsubq_f32(src00, src02); + float32x4_t dst10 = vaddq_f32(src01, src02); + float32x4_t dst20 = vsubq_f32(src02, src01); + float32x4_t dst30 = vsubq_f32(src01, src03); + + float32x4_t dst01 = vsubq_f32(src10, src12); + float32x4_t dst11 = vaddq_f32(src11, src12); + float32x4_t dst21 = vsubq_f32(src12, src11); + float32x4_t dst31 = vsubq_f32(src11, src13); + + float32x4_t dst02 = vsubq_f32(src20, src22); + float32x4_t dst12 = vaddq_f32(src21, src22); + float32x4_t dst22 = vsubq_f32(src22, src21); + float32x4_t dst32 = vsubq_f32(src21, src23); + + float32x4_t dst03 = vsubq_f32(src30, src32); + float32x4_t dst13 = vaddq_f32(src31, src32); + float32x4_t dst23 = vsubq_f32(src32, src31); + float32x4_t dst33 = vsubq_f32(src31, src33); + + float32x4_t dest00 = vsubq_f32(dst00, dst02); + float32x4_t dest10 = vaddq_f32(dst01, dst02); + float32x4_t dest20 = vsubq_f32(dst02, dst01); + float32x4_t dest30 = vsubq_f32(dst01, dst03); + + float32x4_t dest01 = vsubq_f32(dst10, dst12); + float32x4_t dest11 = vaddq_f32(dst11, dst12); + float32x4_t dest21 = vsubq_f32(dst12, dst11); + float32x4_t dest31 = vsubq_f32(dst11, dst13); + + float32x4_t dest02 = vsubq_f32(dst20, dst22); + float32x4_t dest12 = vaddq_f32(dst21, dst22); + float32x4_t dest22 = vsubq_f32(dst22, dst21); + float32x4_t dest32 = vsubq_f32(dst21, dst23); + + float32x4_t dest03 = vsubq_f32(dst30, dst32); + float32x4_t dest13 = vaddq_f32(dst31, dst32); + float32x4_t dest23 = vsubq_f32(dst32, dst31); + float32x4_t dest33 = vsubq_f32(dst31, dst33); + + vst1q_f32(dest, dest00); + vst1q_f32(dest + dest_stride, dest10); + vst1q_f32(dest + dest_stride + dest_stride, dest20); + vst1q_f32(dest + dest_stride + dest_stride + dest_stride, dest30); + dest += dest_h_stride; + vst1q_f32(dest, dest01); + vst1q_f32(dest + dest_stride, dest11); + vst1q_f32(dest + dest_stride + dest_stride, dest21); + vst1q_f32(dest + dest_stride + dest_stride + dest_stride, dest31); + dest += dest_h_stride; + vst1q_f32(dest, dest02); + vst1q_f32(dest + dest_stride, dest12); + vst1q_f32(dest + dest_stride + dest_stride, dest22); + vst1q_f32(dest + dest_stride + dest_stride + dest_stride, dest32); + dest += dest_h_stride; + vst1q_f32(dest, dest03); + vst1q_f32(dest + dest_stride, dest13); + vst1q_f32(dest + dest_stride + dest_stride, dest23); + vst1q_f32(dest + dest_stride + dest_stride + dest_stride, dest33); +} + +// AT=[1, 1, 1, 0, +// 0, 1, -1, -1] +void output_trans_c4_post_2x4(const float* src, + int src_stride, + int src_h_stride, + float* dest, + int dest_stride, + int dest_h_stride, + float* bias_value, + bool has_relu) { + float32x4_t src00 = vld1q_f32(src); + float32x4_t src01 = vld1q_f32(src + src_stride); + float32x4_t src02 = vld1q_f32(src + 
src_stride + src_stride); + float32x4_t src03 = vld1q_f32(src + src_stride + src_stride + src_stride); + src += src_h_stride; + float32x4_t src10 = vld1q_f32(src); + float32x4_t src11 = vld1q_f32(src + src_stride); + float32x4_t src12 = vld1q_f32(src + src_stride + src_stride); + float32x4_t src13 = vld1q_f32(src + src_stride + src_stride + src_stride); + src += src_h_stride; + float32x4_t src20 = vld1q_f32(src); + float32x4_t src21 = vld1q_f32(src + src_stride); + float32x4_t src22 = vld1q_f32(src + src_stride + src_stride); + float32x4_t src23 = vld1q_f32(src + src_stride + src_stride + src_stride); + src += src_h_stride; + float32x4_t src30 = vld1q_f32(src); + float32x4_t src31 = vld1q_f32(src + src_stride); + float32x4_t src32 = vld1q_f32(src + src_stride + src_stride); + float32x4_t src33 = vld1q_f32(src + src_stride + src_stride + src_stride); + + float32x4_t dst00 = vaddq_f32(vaddq_f32(src00, src01), src02); + float32x4_t dst10 = vsubq_f32(vsubq_f32(src01, src02), src03); + float32x4_t dst01 = vaddq_f32(vaddq_f32(src10, src11), src12); + float32x4_t dst11 = vsubq_f32(vsubq_f32(src11, src12), src13); + float32x4_t dst02 = vaddq_f32(vaddq_f32(src20, src21), src22); + float32x4_t dst12 = vsubq_f32(vsubq_f32(src21, src22), src23); + float32x4_t dst03 = vaddq_f32(vaddq_f32(src30, src31), src32); + float32x4_t dst13 = vsubq_f32(vsubq_f32(src31, src32), src33); + + float32x4_t dest00 = vaddq_f32(vaddq_f32(dst00, dst01), dst02); + float32x4_t dest10 = vsubq_f32(vsubq_f32(dst01, dst02), dst03); + float32x4_t dest01 = vaddq_f32(vaddq_f32(dst10, dst11), dst12); + float32x4_t dest11 = vsubq_f32(vsubq_f32(dst11, dst12), dst13); + + if (bias_value) { + float32x4_t bias = vld1q_f32(bias_value); + dest00 = vaddq_f32(dest00, bias); + dest10 = vaddq_f32(dest10, bias); + dest01 = vaddq_f32(dest01, bias); + dest11 = vaddq_f32(dest11, bias); + } + + if (has_relu) { + float32x4_t zeros = vdupq_n_f32(0); + dest00 = vmaxq_f32(dest00, zeros); + dest10 = vmaxq_f32(dest10, zeros); + dest01 = vmaxq_f32(dest01, zeros); + dest11 = vmaxq_f32(dest11, zeros); + } + + vst1q_f32(dest, dest00); + vst1q_f32(dest + dest_stride, dest10); + dest += dest_h_stride; + vst1q_f32(dest, dest01); + vst1q_f32(dest + dest_stride, dest11); +} +void weight_trans_c4_8x8( float* dest, const float* din, int ch_in, int ch_out, void* workspace) { const float coeff[8][3] = {{1.0f, 0.0f, 0.0f}, {-2.0f / 9, -2.0f / 9, -2.0f / 9}, @@ -558,6 +1247,63 @@ void weight_trans_c4( } } +void weight_trans_c4_4x4( + float* dest, const float* din, int ch_in, int ch_out, void* workspace) { + const float coeff[4][3] = {{1.0f, 0.0f, 0.0f}, + {0.5f, 0.5f, 0.5f}, + {0.5f, -0.5f, 0.5f}, + {0.0f, 0.0f, 1.0f}}; + + float* ptr_out = static_cast(workspace); + + for (int i = 0; i < ch_out; i++) { + for (int j = 0; j < ch_in; j++) { + const float* kernel0 = + static_cast(din) + (i * ch_in + j) * 9; + float* ptr_channel = ptr_out + (i * ch_in + j) * 16; + + //! transform kernel, transposed + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + //! h + float tmp[4][3]; + for (int i = 0; i < 4; i++) { + tmp[i][0] = + k0[0] * coeff[i][0] + k0[1] * coeff[i][1] + k0[2] * coeff[i][2]; + tmp[i][1] = + k1[0] * coeff[i][0] + k1[1] * coeff[i][1] + k1[2] * coeff[i][2]; + tmp[i][2] = + k2[0] * coeff[i][0] + k2[1] * coeff[i][1] + k2[2] * coeff[i][2]; + } + + //! 
v + for (int j = 0; j < 4; j++) { + float* tmpp = &tmp[j][0]; + for (int i = 0; i < 4; i++) { + ptr_channel[j * 4 + i] = tmpp[0] * coeff[i][0] + + tmpp[1] * coeff[i][1] + + tmpp[2] * coeff[i][2]; + } + } + } + } + + int oc_pad = (ch_out + 3) / 4 * 4; + int ic_pad = (ch_in + 3) / 4 * 4; + int c_stride = ic_pad * oc_pad; + for (int i = 0; i < ch_out * ch_in * 16; ++i) { + int new_c = i % 16; + int new_oc = i / ch_in / 16 / 4; + int new_ic = i / 16 % (ch_in * 4) % ch_in; + int new_inner = i / ch_in / 16 % 4; + int dest_ind = + new_c * c_stride + new_oc * ic_pad * 4 + new_ic * 4 + new_inner; + dest[dest_ind] = ptr_out[i]; + } +} + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/conv3x3s1_direct_fp32.cc b/lite/backends/arm/math/conv3x3s1_direct_fp32.cc index b4972a1ecab151947f8aaa7d6db0f6e82a08e5e4..5cee02b639af7e04a9184af765a5e96be4cb4cdb 100644 --- a/lite/backends/arm/math/conv3x3s1_direct_fp32.cc +++ b/lite/backends/arm/math/conv3x3s1_direct_fp32.cc @@ -76,6 +76,7 @@ void conv_3x3s1_direct_fp32(const float* i_data, const int threads = ctx->threads(); int l2_size = ctx->llc_size() / sizeof(float); auto paddings = *param.paddings; + auto act_param = param.activation_param; const int pad_h = paddings[0]; const int pad_w = paddings[2]; @@ -469,7 +470,8 @@ void conv_3x3s1_direct_fp32(const float* i_data, oh, ow, flag_relu, - ptr_write); + ptr_write, + &act_param); } const float* weight_remain_ptr = weights + c_round_down * w_stride; #pragma omp parallel for num_threads(threads) @@ -780,7 +782,8 @@ void conv_3x3s1_direct_fp32(const float* i_data, oh, ow, flag_relu, - ptr_write); + ptr_write, + &act_param); } } } diff --git a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc index e4c9fb99ef9a6b5d3987a1efd5a644f322ea043c..6f056677378ad0499e0f2ce8b0dd56cee5d6a6ae 100644 --- a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc +++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc @@ -32,6 +32,7 @@ void conv_depthwise_3x3s1p0_bias(float *dout, const int w_in, const int h_out, const int w_out, + const operators::ActivationParam act_param, ARMContext *ctx); void conv_depthwise_3x3s1p0_bias_s(float *dout, @@ -46,6 +47,7 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout, const int w_in, const int h_out, const int w_out, + const operators::ActivationParam act_param, ARMContext *ctx); void conv_depthwise_3x3s1p1_bias(float *dout, @@ -60,6 +62,7 @@ void conv_depthwise_3x3s1p1_bias(float *dout, const int w_in, const int h_out, const int w_out, + const operators::ActivationParam act_param, ARMContext *ctx); void conv_depthwise_3x3s1p1_bias_s(float *dout, @@ -74,6 +77,7 @@ void conv_depthwise_3x3s1p1_bias_s(float *dout, const int w_in, const int h_out, const int w_out, + const operators::ActivationParam act_param, ARMContext *ctx); void conv_depthwise_3x3s1_fp32(const float *din, @@ -90,6 +94,7 @@ void conv_depthwise_3x3s1_fp32(const float *din, int pad, bool flag_bias, bool flag_relu, + const operators::ActivationParam act_param, ARMContext *ctx) { if (pad == 0) { if (w_in > 5) { @@ -105,6 +110,7 @@ void conv_depthwise_3x3s1_fp32(const float *din, w_in, h_out, w_out, + act_param, ctx); } else { conv_depthwise_3x3s1p0_bias_s(dout, @@ -119,6 +125,7 @@ void conv_depthwise_3x3s1_fp32(const float *din, w_in, h_out, w_out, + act_param, ctx); } } @@ -136,6 +143,7 @@ void conv_depthwise_3x3s1_fp32(const float *din, w_in, h_out, w_out, + act_param, ctx); } else { conv_depthwise_3x3s1p1_bias_s(dout, @@ 
-150,11 +158,12 @@ void conv_depthwise_3x3s1_fp32(const float *din, w_in, h_out, w_out, + act_param, ctx); } } } - +// clang-format on #ifdef __aarch64__ #define INIT_S1 \ "PRFM PLDL1KEEP, [%[din_ptr0]] \n" \ @@ -255,14 +264,12 @@ void conv_depthwise_3x3s1_fp32(const float *din, "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ \ - "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ + "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ /* r4 */ \ + "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ #define LEFT_RESULT_S1 \ - /* r4 */ \ - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ - \ "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ @@ -345,16 +352,15 @@ void conv_depthwise_3x3s1_fp32(const float *din, "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ \ - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ + "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ #define MID_RESULT_S1 \ - /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ "st1 {v12.4s}, [%[doutr0]], #16 \n" \ \ "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ @@ -411,30 +417,31 @@ void conv_depthwise_3x3s1_fp32(const float *din, #define RIGHT_COMPUTE_S1 \ "3: \n" \ + "movi v20.4s, #0 \n" \ "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" \ "ld1 {v22.4s}, [%[doutr0]] \n" \ "ld1 {v23.4s}, [%[doutr1]] \n" \ "ld1 {v24.4s}, [%[doutr2]] \n" \ "ld1 {v25.4s}, [%[doutr3]] \n" \ \ - "bif v0.16b, %[vzero].16b, v18.16b \n" \ - "bif v1.16b, %[vzero].16b, v19.16b \n" \ - "bif v2.16b, %[vzero].16b, v18.16b \n" \ - "bif v3.16b, %[vzero].16b, v19.16b \n" \ + "bif v0.16b, v20.16b, v18.16b \n" \ + "bif v1.16b, v20.16b, v19.16b \n" \ + "bif v2.16b, v20.16b, v18.16b \n" \ + "bif v3.16b, v20.16b, v19.16b \n" \ \ - "bif v4.16b, %[vzero].16b, v18.16b \n" \ - "bif v5.16b, %[vzero].16b, v19.16b \n" \ - "bif v6.16b, %[vzero].16b, v18.16b \n" \ - "bif v7.16b, %[vzero].16b, v19.16b \n" \ + "bif v4.16b, v20.16b, v18.16b \n" \ + "bif v5.16b, v20.16b, v19.16b \n" \ + "bif v6.16b, v20.16b, v18.16b \n" \ + "bif v7.16b, v20.16b, v19.16b \n" \ \ "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ /* r0 */ \ "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ \ - "bif v8.16b, %[vzero].16b, v18.16b \n" 
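
All of the LEFT/MID/RIGHT asm macros in this file build on one vector pattern: load eight consecutive inputs, form the two shifted views with `ext`, and issue three fused multiply-adds per filter row. A standalone NEON-intrinsics sketch of that pattern (an illustration, not code from this patch):

```cpp
#include <arm_neon.h>

// One filter row of a 3x3 stride-1 depthwise conv, four outputs at a time:
// acc += din[0..3] * w[0] + din[1..4] * w[1] + din[2..5] * w[2]
static inline float32x4_t row3x3s1(float32x4_t acc, const float* din,
                                   const float w[3]) {
  float32x4_t v0 = vld1q_f32(din);        // din 0 1 2 3
  float32x4_t v1 = vld1q_f32(din + 4);    // din 4 5 6 7
  float32x4_t d1 = vextq_f32(v0, v1, 1);  // din 1 2 3 4  ("ext ..., #4")
  float32x4_t d2 = vextq_f32(v0, v1, 2);  // din 2 3 4 5  ("ext ..., #8")
  acc = vmlaq_n_f32(acc, v0, w[0]);
  acc = vmlaq_n_f32(acc, d1, w[1]);
  acc = vmlaq_n_f32(acc, d2, w[2]);
  return acc;
}
```
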
\ - "bif v9.16b, %[vzero].16b, v19.16b \n" \ - "bif v10.16b, %[vzero].16b, v18.16b \n" \ - "bif v11.16b, %[vzero].16b, v19.16b \n" \ + "bif v8.16b, v20.16b, v18.16b \n" \ + "bif v9.16b, v20.16b, v19.16b \n" \ + "bif v10.16b, v20.16b, v18.16b \n" \ + "bif v11.16b, v20.16b, v19.16b \n" \ \ "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ \ @@ -467,15 +474,13 @@ void conv_depthwise_3x3s1_fp32(const float *din, "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ \ - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ + "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ #define RIGHT_RESULT_S1 \ - /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ "bif v12.16b, v22.16b, v18.16b \n" \ \ "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ @@ -520,10 +525,6 @@ void conv_depthwise_3x3s1_fp32(const float *din, "st1 {v15.4s}, [%[doutr3]], #16 \n" #define LEFT_RESULT_S1_RELU \ - /* r4 */ \ - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ - \ "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ \ @@ -570,14 +571,113 @@ void conv_depthwise_3x3s1_fp32(const float *din, "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ "blt 3f \n" +#define LEFT_RESULT_S1_RELU6 \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "fmin v12.4s, v12.4s, %[vsix].4s \n" /*relu6*/ \ + "fmin v13.4s, v13.4s, %[vsix].4s \n" /*relu6*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ + "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ \ + "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ /* r5*/ \ + \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + \ + "fmin v14.4s, v14.4s, %[vsix].4s \n" /*relu6*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ + \ + "ext 
v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmin v15.4s, v15.4s, %[vsix].4s \n" /*relu6*/ \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ + "cmp %w[cnt], #1 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "blt 3f \n" + +#define LEFT_RESULT_S1_LEAKY_RELU \ + "cmhs v18.4s, v12.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ + "cmhs v19.4s, v13.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ + "fmul v20.4s, v12.4s, %[vscale].4s \n" /* mul */ \ + "fmul v21.4s, v12.4s, %[vscale].4s \n" /* mul */ \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "bif v12.16b, v20.16b, v18.16b \n" /* choose*/ \ + "bif v13.16b, v21.16b, v19.16b \n" /* choose*/ \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ + \ + "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ /* r5*/ \ + "cmhs v18.4s, v14.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ + "fmul v20.4s, v14.4s, %[vscale].4s \n" /* mul */ \ + \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + \ + "bif v14.16b, v20.16b, v18.16b \n" /* choose*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "cmhs v18.4s, v15.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ + "fmul v20.4s, v15.4s, %[vscale].4s \n" /* mul */ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "bif v15.16b, v20.16b, v18.16b \n" /* choose*/ \ + "cmp %w[cnt], #1 \n" \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "blt 3f \n" + #define MID_RESULT_S1_RELU \ - /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + "movi v20.4s, #0 \n" \ + "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ \ \ "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ @@ -598,7 +698,7 @@ void conv_depthwise_3x3s1_fp32(const float *din, "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ \ "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "fmax v13.4s, v13.4s, 
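
The relu6 variants interleave the clamp with the remaining fmlas and stores for scheduling, but per result vector the epilogue is just a two-sided clamp (sketch):

```cpp
#include <arm_neon.h>

// relu6 epilogue: out = min(max(x, 0), six), matching the fmax/fmin pairs.
static inline float32x4_t relu6(float32x4_t x, float32x4_t vsix) {
  return vminq_f32(vmaxq_f32(x, vdupq_n_f32(0.f)), vsix);
}
```
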
%[vzero].4s \n" /*relu*/ \ + "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ \ \ "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ @@ -617,7 +717,7 @@ void conv_depthwise_3x3s1_fp32(const float *din, /* r3 */ \ "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + "fmax v14.4s, v14.4s, v20.4s \n" /*relu*/ \ \ "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ \ @@ -633,20 +733,157 @@ void conv_depthwise_3x3s1_fp32(const float *din, \ "subs %w[cnt], %w[cnt], #1 \n" \ \ - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + "fmax v15.4s, v15.4s, v20.4s \n" /*relu*/ \ \ "st1 {v15.4s}, [%[doutr3]], #16 \n" \ "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ \ "bne 1b \n" -#define RIGHT_RESULT_S1_RELU \ +#define MID_RESULT_S1_RELU6 \ + "movi v20.4s, #0 \n" \ + "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmin v12.4s, v12.4s, %[vsix].4s \n" /*relu6*/ \ + \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmin v13.4s, v13.4s, %[vsix].4s \n" /*relu6*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "fmax v14.4s, v14.4s, v20.4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmin v14.4s, v14.4s, %[vsix].4s \n" /*relu6*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmax v15.4s, v15.4s, v20.4s \n" /*relu*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmin v15.4s, v15.4s, 
%[vsix].4s \n" /*relu6*/ \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "bne 1b \n" + +#define MID_RESULT_S1_LEAKY_RELU \ + "movi v21.4s, #0 \n" \ + "cmhs v18.4s, v12.4s, v21.4s \n" /* vcgeq_u32 */ \ + "fmul v20.4s, v12.4s, %[vscale].4s \n" /* mul */ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v12.16b, v20.16b, v18.16b \n" /* choose*/ \ + \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "cmhs v18.4s, v13.4s, v21.4s \n" /* vcgeq_u32 */ \ + "fmul v20.4s, v13.4s, %[vscale].4s \n" /* mul */ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "bif v13.16b, v20.16b, v18.16b \n" /* choose*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "cmhs v18.4s, v14.4s, v21.4s \n" /* vcgeq_u32 */ \ + "fmul v20.4s, v14.4s, %[vscale].4s \n" /* mul */ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v14.16b, v20.16b, v18.16b \n" /* choose*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "cmhs v18.4s, v15.4s, v21.4s \n" /* vcgeq_u32 */ \ + "fmul v20.4s, v15.4s, %[vscale].4s \n" /* mul */ \ + \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "bif v15.16b, v20.16b, v18.16b \n" /* choose*/ \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ \ - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + "bne 1b \n" + +#define RIGHT_RESULT_S1_RELU \ + "fmax v12.4s, v12.4s, v20.4s \n" 
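
For leaky relu the epilogue becomes a compare, a scaled copy, and a bitwise select, which is what the `cmhs`/`fmul`/`bif` triples spell out. Expressed with a float compare (the intent of the mask), the branchless form is:

```cpp
#include <arm_neon.h>

// leaky relu: x >= 0 ? x : x * alpha, as compare + multiply + select.
static inline float32x4_t leaky_relu(float32x4_t x, float32x4_t valpha) {
  uint32x4_t ge_zero = vcgeq_f32(x, vdupq_n_f32(0.f));  // lanes where x >= 0
  float32x4_t scaled = vmulq_f32(x, valpha);
  return vbslq_f32(ge_zero, x, scaled);
}
```
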
/*relu*/ \ \ "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ @@ -664,7 +901,7 @@ void conv_depthwise_3x3s1_fp32(const float *din, "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ \ "st1 {v12.4s}, [%[doutr0]], #16 \n" \ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ \ \ "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ @@ -680,7 +917,7 @@ void conv_depthwise_3x3s1_fp32(const float *din, "st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \ "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ \ - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + "fmax v14.4s, v14.4s, v20.4s \n" /*relu*/ \ \ "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ \ @@ -690,72 +927,184 @@ void conv_depthwise_3x3s1_fp32(const float *din, \ "st1 {v14.4s}, [%[doutr2]], #16 \n" \ \ - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + "fmax v15.4s, v15.4s, v20.4s \n" /*relu*/ \ \ "bif v15.16b, v25.16b, v18.16b \n" \ \ "st1 {v15.4s}, [%[doutr3]], #16 \n" -#define COMPUTE_S_S1 \ - "prfm pldl1keep, [%[din0]]\n" \ - "prfm pldl1keep, [%[din1]]\n" \ - "prfm pldl1keep, [%[din2]]\n" \ - "prfm pldl1keep, [%[din3]]\n" \ - \ - "ld1 {v0.4s}, [%[din0]], #16\n" \ - "ld1 {v1.4s}, [%[din1]], #16\n" \ - "ld1 {v2.4s}, [%[din2]], #16\n" \ - "ld1 {v3.4s}, [%[din3]], #16\n" \ - \ - "bif v0.16b, %[zero].16b, %[mask].16b\n" \ - "bif v1.16b, %[zero].16b, %[mask].16b\n" \ - "bif v2.16b, %[zero].16b, %[mask].16b\n" \ - "bif v3.16b, %[zero].16b, %[mask].16b\n" \ - \ - "ext v4.16b, %[zero].16b, v0.16b, #12\n" \ - "ext v5.16b, %[zero].16b, v1.16b, #12\n" \ - "ext v6.16b, %[zero].16b, v2.16b, #12\n" \ - "ext v7.16b, %[zero].16b, v3.16b, #12\n" \ - \ - "ext v8.16b, v0.16b, %[zero].16b, #4\n" \ - "ext v9.16b, v1.16b, %[zero].16b, #4\n" \ - "ext v10.16b, v2.16b, %[zero].16b, #4\n" \ - "ext v11.16b, v3.16b, %[zero].16b, #4\n" \ - \ - "fmul v12.4s, v0.4s, %[wr0].s[1]\n" \ - "fmul v13.4s, v1.4s, %[wr0].s[1]\n" \ - \ - "fmul v14.4s, v1.4s, %[wr1].s[1]\n" \ - "fmul v15.4s, v2.4s, %[wr1].s[1]\n" \ - \ - "fmul v16.4s, v2.4s, %[wr2].s[1]\n" \ - "fmul v17.4s, v3.4s, %[wr2].s[1]\n" \ - \ - "fmla v12.4s, v4.4s, %[wr0].s[0]\n" \ - "fmla v13.4s, v5.4s, %[wr0].s[0]\n" \ - \ - "fmla v14.4s, v5.4s, %[wr1].s[0]\n" \ - "fmla v15.4s, v6.4s, %[wr1].s[0]\n" \ - \ - "fmla v16.4s, v6.4s, %[wr2].s[0]\n" \ - "fmla v17.4s, v7.4s, %[wr2].s[0]\n" \ - \ - "fmla v12.4s, v8.4s, %[wr0].s[2]\n" \ - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ - \ - "fmla v14.4s, v9.4s, %[wr1].s[2]\n" \ - "fmla v15.4s, v10.4s, %[wr1].s[2]\n" \ - \ - "fmla v16.4s, v10.4s, %[wr2].s[2]\n" \ - "fmla v17.4s, v11.4s, %[wr2].s[2]\n" \ - \ - "fadd v12.4s, v12.4s, v14.4s\n" \ - "fadd v12.4s, v12.4s, v16.4s\n" \ - \ - "fadd v13.4s, v13.4s, v15.4s\n" \ - "fadd v13.4s, v13.4s, v17.4s\n" \ - \ - "fadd v12.4s, v12.4s, %[bias].4s\n" \ +#define RIGHT_RESULT_S1_RELU6 \ + "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmin v12.4s, v12.4s, %[vsix].4s \n" /*relu6*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ 
\ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "bif v12.16b, v22.16b, v18.16b \n" \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "fmin v13.4s, v13.4s, %[vsix].4s \n" /*relu6*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + "bif v13.16b, v23.16b, v18.16b \n" \ + \ + "fmla v15.4s , v10.4s, v20.s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmax v14.4s, v14.4s, v20.4s \n" /*relu*/ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmin v14.4s, v14.4s, %[vsix].4s \n" /*relu6*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "bif v14.16b, v24.16b, v18.16b \n" \ + "fmax v15.4s, v15.4s, v20.4s \n" /*relu*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmin v15.4s, v15.4s, %[vsix].4s \n" /*relu6*/ \ + "bif v15.16b, v25.16b, v18.16b \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" + +#define RIGHT_RESULT_S1_LEAKY_RELU \ + "movi v1.4s, #0 \n" \ + "cmhs v20.4s, v12.4s, v1.4s \n" /* vcgeq_u32 */ \ + "fmul v21.4s, v12.4s, %[vscale].4s \n" /* mul */ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v12.16b, v21.16b, v20.16b \n" /* choose*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "bif v12.16b, v22.16b, v18.16b \n" \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "cmhs v20.4s, v13.4s, v1.4s \n" /* vcgeq_u32 */ \ + "fmul v21.4s, v13.4s, %[vscale].4s \n" /* mul */ \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v13.16b, v21.16b, v20.16b \n" \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + \ + "bif v13.16b, v23.16b, v18.16b \n" \ + \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "cmhs v20.4s, v14.4s, v1.4s \n" /* vcgeq_u32 */ \ + "fmul v21.4s, v14.4s, %[vscale].4s \n" /* mul */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \ + \ + "fmla v15.4s , 
v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v14.16b, v21.16b, v20.16b \n" \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "bif v14.16b, v24.16b, v18.16b \n" \ + \ + "cmhs v20.4s, v15.4s, v1.4s \n" /* vcgeq_u32 */ \ + "fmul v21.4s, v15.4s, %[vscale].4s \n" /* mul */ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + "bif v15.16b, v21.16b, v20.16b \n" \ + "bif v15.16b, v25.16b, v18.16b \n" \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" + +#define COMPUTE_S_S1 \ + "prfm pldl1keep, [%[din0]]\n" \ + "prfm pldl1keep, [%[din1]]\n" \ + "prfm pldl1keep, [%[din2]]\n" \ + "prfm pldl1keep, [%[din3]]\n" \ + \ + "ld1 {v0.4s}, [%[din0]], #16\n" \ + "ld1 {v1.4s}, [%[din1]], #16\n" \ + "ld1 {v2.4s}, [%[din2]], #16\n" \ + "ld1 {v3.4s}, [%[din3]], #16\n" \ + \ + "bif v0.16b, %[vzero].16b, %[mask].16b\n" \ + "bif v1.16b, %[vzero].16b, %[mask].16b\n" \ + "bif v2.16b, %[vzero].16b, %[mask].16b\n" \ + "bif v3.16b, %[vzero].16b, %[mask].16b\n" \ + \ + "ext v4.16b, %[vzero].16b, v0.16b, #12\n" \ + "ext v5.16b, %[vzero].16b, v1.16b, #12\n" \ + "ext v6.16b, %[vzero].16b, v2.16b, #12\n" \ + "ext v7.16b, %[vzero].16b, v3.16b, #12\n" \ + \ + "ext v8.16b, v0.16b, %[vzero].16b, #4\n" \ + "ext v9.16b, v1.16b, %[vzero].16b, #4\n" \ + "ext v10.16b, v2.16b, %[vzero].16b, #4\n" \ + "ext v11.16b, v3.16b, %[vzero].16b, #4\n" \ + \ + "fmul v12.4s, v0.4s, %[wr0].s[1]\n" \ + "fmul v13.4s, v1.4s, %[wr0].s[1]\n" \ + \ + "fmul v14.4s, v1.4s, %[wr1].s[1]\n" \ + "fmul v15.4s, v2.4s, %[wr1].s[1]\n" \ + \ + "fmul v16.4s, v2.4s, %[wr2].s[1]\n" \ + "fmul v17.4s, v3.4s, %[wr2].s[1]\n" \ + \ + "fmla v12.4s, v4.4s, %[wr0].s[0]\n" \ + "fmla v13.4s, v5.4s, %[wr0].s[0]\n" \ + \ + "fmla v14.4s, v5.4s, %[wr1].s[0]\n" \ + "fmla v15.4s, v6.4s, %[wr1].s[0]\n" \ + \ + "fmla v16.4s, v6.4s, %[wr2].s[0]\n" \ + "fmla v17.4s, v7.4s, %[wr2].s[0]\n" \ + \ + "fmla v12.4s, v8.4s, %[wr0].s[2]\n" \ + "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ + \ + "fmla v14.4s, v9.4s, %[wr1].s[2]\n" \ + "fmla v15.4s, v10.4s, %[wr1].s[2]\n" \ + \ + "fmla v16.4s, v10.4s, %[wr2].s[2]\n" \ + "fmla v17.4s, v11.4s, %[wr2].s[2]\n" \ + \ + "fadd v12.4s, v12.4s, v14.4s\n" \ + "fadd v12.4s, v12.4s, v16.4s\n" \ + \ + "fadd v13.4s, v13.4s, v15.4s\n" \ + "fadd v13.4s, v13.4s, v17.4s\n" \ + \ + "fadd v12.4s, v12.4s, %[bias].4s\n" \ "fadd v13.4s, v13.4s, %[bias].4s\n" #define RESULT_S_S1 \ @@ -765,16 +1114,42 @@ void conv_depthwise_3x3s1_fp32(const float *din, "st1 {v12.4s}, [%[out1]]\n" \ "st1 {v13.4s}, [%[out2]]\n" -#define RESULT_S_S1_RELU \ - "prfm pldl1keep, [%[out1]]\n" \ - "prfm pldl1keep, [%[out2]]\n" \ - \ - "fmax v12.4s, v12.4s, %[zero].4s\n" \ - "fmax v13.4s, v13.4s, %[zero].4s\n" \ - \ - "st1 {v12.4s}, [%[out1]]\n" \ +#define RESULT_S_S1_RELU \ + "prfm pldl1keep, [%[out1]]\n" \ + "prfm pldl1keep, [%[out2]]\n" \ + \ + "fmax v12.4s, v12.4s, %[vzero].4s\n" \ + "fmax v13.4s, v13.4s, %[vzero].4s\n" \ + \ + "st1 {v12.4s}, [%[out1]]\n" \ + "st1 {v13.4s}, [%[out2]]\n" + +#define RESULT_S_S1_RELU6 \ + "prfm pldl1keep, [%[out1]]\n" \ + "prfm pldl1keep, [%[out2]]\n" \ + \ + "fmax v12.4s, v12.4s, %[vzero].4s\n" \ + "fmax v13.4s, v13.4s, %[vzero].4s\n" \ + \ + "fmin v12.4s, v12.4s, %[vsix].4s\n" \ + "fmin v13.4s, v13.4s, %[vsix].4s\n" \ + \ + "st1 {v12.4s}, [%[out1]]\n" \ "st1 {v13.4s}, [%[out2]]\n" +#define RESULT_S_S1_LEAKY_RELU \ + "prfm pldl1keep, [%[out1]]\n" \ + "prfm pldl1keep, [%[out2]]\n" \ + \ + "cmhs v18.4s, v12.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ + "cmhs v19.4s, v13.4s, %[vzero].4s \n" /* vcgeq_u32 */ \ + "fmul v20.4s, v12.4s, 
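
COMPUTE_S_S1 covers the narrow case (w_in <= 4, pad = 1): each filter row contributes a center, a left-shifted, and a right-shifted product, and the partials are reduced with two fadds before the bias. A scalar sketch of one output row, assuming the input rows are already masked so lanes at or beyond w_in are zero (names are illustrative):

```cpp
// Mirrors COMPUTE_S_S1's ext/fmul/fmla/fadd sequence for one output row.
void narrow_row_3x3s1p1(const float r0[4], const float r1[4],
                        const float r2[4], const float wr[3][3], float bias,
                        float out[4]) {
  const float* rows[3] = {r0, r1, r2};
  for (int x = 0; x < 4; ++x) {
    float acc = bias;
    for (int k = 0; k < 3; ++k) {
      float left = (x == 0) ? 0.f : rows[k][x - 1];   // "ext vzero, vN, #12"
      float right = (x == 3) ? 0.f : rows[k][x + 1];  // "ext vN, vzero, #4"
      acc += left * wr[k][0] + rows[k][x] * wr[k][1] + right * wr[k][2];
    }
    out[x] = acc;
  }
}
```
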
%[vscale].4s \n" /* mul */ \ + "fmul v21.4s, v13.4s, %[vscale].4s \n" /* mul */ \ + \ + "bif v12.16b, v20.16b, v18.16b \n" \ + "bif v13.16b, v21.16b, v19.16b \n" \ + "st1 {v12.4s}, [%[out1]]\n" \ + "st1 {v13.4s}, [%[out2]]\n" #define COMPUTE_S_S1_P0 \ "prfm pldl1keep, [%[din0]]\n" \ "prfm pldl1keep, [%[din1]]\n" \ @@ -786,17 +1161,17 @@ void conv_depthwise_3x3s1_fp32(const float *din, "ld1 {v4.4s, v5.4s}, [%[din2]]\n" \ "ld1 {v6.4s, v7.4s}, [%[din3]]\n" \ \ - "bif v0.16b, %[zero].16b, %[mask1].16b\n" \ - "bif v1.16b, %[zero].16b, %[mask2].16b\n" \ + "bif v0.16b, %[vzero].16b, %[mask1].16b\n" \ + "bif v1.16b, %[vzero].16b, %[mask2].16b\n" \ \ - "bif v2.16b, %[zero].16b, %[mask1].16b\n" \ - "bif v3.16b, %[zero].16b, %[mask2].16b\n" \ + "bif v2.16b, %[vzero].16b, %[mask1].16b\n" \ + "bif v3.16b, %[vzero].16b, %[mask2].16b\n" \ \ - "bif v4.16b, %[zero].16b, %[mask1].16b\n" \ - "bif v5.16b, %[zero].16b, %[mask2].16b\n" \ + "bif v4.16b, %[vzero].16b, %[mask1].16b\n" \ + "bif v5.16b, %[vzero].16b, %[mask2].16b\n" \ \ - "bif v6.16b, %[zero].16b, %[mask1].16b\n" \ - "bif v7.16b, %[zero].16b, %[mask2].16b\n" \ + "bif v6.16b, %[vzero].16b, %[mask1].16b\n" \ + "bif v7.16b, %[vzero].16b, %[mask2].16b\n" \ \ "ext v8.16b, v0.16b, v1.16b, #4\n" \ "ext v9.16b, v0.16b, v1.16b, #8\n" \ @@ -849,7 +1224,6 @@ void conv_depthwise_3x3s1_fp32(const float *din, // "st1 {v12.4s}, [%[out1]]\n" \ // "st1 {v13.4s}, [%[out2]]\n" \ - #else #define INIT_S1 \ "pld [%[din0_ptr]] @ preload data\n" \ @@ -1129,6 +1503,66 @@ void conv_depthwise_3x3s1_fp32(const float *din, "vdup.32 q5, %[bias_val] @ and \n" \ "blt 3f @ jump to main loop start point\n" +#define LEFT_RESULT_S1_RELU6 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.f32 {d28-d29}, [%[six_ptr]] @ load six \n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vmin.f32 q4, q4, q14 @ relu6 \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + "vmin.f32 q5, q5, q14 @ relu6 \n" \ + "cmp %[cnt], #1 @ check whether has mid cols\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vdup.32 q5, %[bias_val] @ and \n" \ + "blt 3f @ jump to main loop start point\n" + +#define LEFT_RESULT_S1_LEAKY_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + "vld1.f32 {d28-d29}, [%[scale_ptr]] @ load scale \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + "vcge.f32 q15, q4, %q[vzero] @ q0 > 0 \n" \ + "vmul.f32 q6, q4, q14 \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vbif q4, q6, q15 @ choose \n" \ + "vcge.f32 q7, q5, %q[vzero] @ q0 > 0 \n" \ + "vmul.f32 q6, q5, q14 \n" \ + \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vbif q5, q6, q7 @ choose \n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add pointer\n" \ + "cmp %[cnt], #1 @ check whether has mid cols\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + "blt 3f @ jump to main loop start point\n" + #define MID_RESULT_S1_RELU \ /* r3 */ \ "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ @@ -1157,6 +1591,69 @@ void conv_depthwise_3x3s1_fp32(const float *din, \ "bne 1b @ jump to main loop start point\n" +#define MID_RESULT_S1_RELU6 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[six_ptr]]! @ load din r0\n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmin.f32 q4, q4, q14 @ relu6 \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmin.f32 q5, q5, q14 @ relu6 \n" \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "subs %[cnt], #1 @ loop count minus 1\n" \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "bne 1b @ jump to main loop start point\n" + +#define MID_RESULT_S1_LEAKY_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[scale_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vcge.f32 q15, q4, %q[vzero] @ q0 > 0 \n" \ + "vmul.f32 q6, q4, q14 \n" \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vbif q4, q6, q15 @ choose \n" \ + "vcge.f32 q7, q5, %q[vzero] @ q0 > 0 \n" \ + "vmul.f32 q6, q4, q14 \n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + \ + "vbif q5, q6, q7 @ choose \n" \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "subs %[cnt], #1 @ loop count minus 1\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "bne 1b @ jump to main loop start point\n" + #define RIGHT_RESULT_S1_RELU \ /* r3 */ \ "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ @@ -1178,6 +1675,58 @@ void conv_depthwise_3x3s1_fp32(const float *din, \ "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" +#define RIGHT_RESULT_S1_RELU6 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[six_ptr]] @ load din r0\n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmin.f32 q4, q4, q14 @ relu6 \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ + "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmin.f32 q5, q5, q14 @ relu6 \n" \ + "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ + "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add pointer\n" + +#define RIGHT_RESULT_S1_LEAKY_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[scale_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vcge.f32 q15, q4, %q[vzero] @ q0 > 0 \n" \ + "vmul.f32 q6, q4, q14 \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vbif q4, q6, q15 @ choose \n" \ + \ + "vcge.f32 q7, q5, %q[vzero] @ q0 > 0 \n" \ + "vmul.f32 q6, q5, q14 \n" \ + \ + "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ + "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ + "vbif q5, q6, q7 @ choose \n" \ + \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ + "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" + #define COMPUTE_S_S1 \ "pld [%[din0]]\n" \ "pld [%[din1]]\n" \ @@ -1251,6 +1800,36 @@ void conv_depthwise_3x3s1_fp32(const float *din, "vst1.32 {d28-d29}, [%[out1]]\n" \ "vst1.32 {d30-d31}, [%[out2]]\n" +#define RESULT_S_S1_RELU6 \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vld1.32 {d20-d21}, [%[six_ptr]] \n" \ + "vmax.f32 q14, q14, %q[vzero]\n" \ + "vmax.f32 q15, q15, %q[vzero]\n" \ + \ + "vmin.f32 q14, q14, q10 \n" \ + "vmin.f32 q15, q15, q10 \n" \ + \ + "vst1.32 {d28-d29}, [%[out1]]\n" \ + "vst1.32 {d30-d31}, [%[out2]]\n" + +#define RESULT_S_S1_LEAKY_RELU \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vld1.32 {d18-d19}, [%[scale_ptr]] \n" \ + "vcge.f32 q10, q14, %q[vzero] @ q0 > 0 \n" \ + "vcge.f32 q11, q15, %q[vzero] @ q0 > 0 \n" \ + "vmul.f32 q12, q14, q9 \n" \ + "vmul.f32 q13, q15, q9 \n" \ + \ + "vbif q14, q10, q12 \n" \ + "vbif q15, q11, q13 \n" \ + \ + "vst1.32 {d28-d29}, [%[out1]]\n" \ + "vst1.32 {d30-d31}, [%[out2]]\n" + #define COMPUTE_S_S1_P0 \ "pld [%[din0]]\n" \ "pld [%[din1]]\n" \ @@ -1333,6 +1912,413 @@ void conv_depthwise_3x3s1_fp32(const float *din, "vadd.f32 q15, q5, q9 @ q4 += q10 \n" #endif + +#ifdef __aarch64__ +void act_switch_3x3s1p1(const float *din_ptr0, + const float *din_ptr1, + const float *din_ptr2, + const float *din_ptr3, + const float *din_ptr4, + const float *din_ptr5, + float *doutr0, + float *doutr1, + float *doutr2, + float *doutr3, + float32x4_t wr0, + float32x4_t wr1, + float32x4_t wr2, + unsigned int *vmask, + unsigned int *rmask, + float32x4_t vzero, + float *vbias, + int cnt, + const operators::ActivationParam act_param) { + bool has_active = act_param.has_active; + if (has_active) { + float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", 
+ "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU6 MID_COMPUTE_S1 + MID_RESULT_S1_RELU6 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU6 + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [vsix] "w"(vsix), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? din : din * scale*/ + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU + MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU + RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_LEAKY_RELU + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [vscale] "w"(vscale), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + default: + LOG(FATAL) << "this act_type: " + << static_cast(act_param.active_type) + << " fuse not support"; + } + } else { + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 + MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } +} +#else +void act_switch_3x3s1p1(const float *din_ptr0, + const float *din_ptr1, + const float *din_ptr2, + const float *din_ptr3, + float *doutr0, + float *doutr1, + float32x4_t wr0, + float32x4_t wr1, + float32x4_t wr2, + unsigned int *vmask_ptr, + unsigned int *rmask_ptr, + float32x4_t vzero, + float bias_val, + int cnt, + const operators::ActivationParam act_param) { + bool has_active = act_param.has_active; + if (has_active) { + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; + + switch (act_param.active_type) { + case 
lite_api::ActivationType::kRelu: + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU6 MID_COMPUTE_S1 + MID_RESULT_S1_RELU6 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU6 + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [six_ptr] "r"(vsix), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? din : din * scale*/ + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_LEAKY_RELU + MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU + RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_LEAKY_RELU + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [scale_ptr] "r"(vscale), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + default: + LOG(FATAL) << "this act_type: " + << static_cast(act_param.active_type) + << " fuse not support"; + } + } else { + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 + MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +} +#endif +// clang-format on /** * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, * width > 4 @@ -1349,6 +2335,7 @@ void conv_depthwise_3x3s1p1_bias(float *dout, const int w_in, const int h_out, const int w_out, + const operators::ActivationParam act_param, ARMContext *ctx) { //! 
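
Note how the armv8 and armv7 paths pass the extra activation constants differently: armv8 has spare v-registers, so `vsix`/`vscale` ride in as `"w"` vector operands, while the armv7 kernels already use all of q4-q15 and instead take a pointer (`six_ptr`/`scale_ptr`) and reload the constant inside the asm. Roughly (a sketch of the two styles):

```cpp
#include <arm_neon.h>

void pass_relu6_bound(float clip) {
  // armv8 style: broadcast once, bound as a vector operand ([vsix] "w"(vsix)).
  float32x4_t vsix = vdupq_n_f32(clip);
  // armv7 style: spill to a small array, bind its address ([six_ptr] "r"(...))
  // and vld1.f32 it inside the asm when a q-register is free.
  float vsix_arr[4] = {clip, clip, clip, clip};
  (void)vsix;
  (void)vsix_arr;
}
```
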
pad is done implicit const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; @@ -1486,106 +2473,25 @@ void conv_depthwise_3x3s1p1_bias(float *dout, } int cnt = cnt_col; - if (flag_relu) { - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 - MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - } else { - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 - MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - } + act_switch_3x3s1p1(din_ptr0, + din_ptr1, + din_ptr2, + din_ptr3, + din_ptr4, + din_ptr5, + doutr0, + doutr1, + doutr2, + doutr3, + wr0, + wr1, + wr2, + vmask, + rmask, + vzero, + vbias, + cnt, + act_param); dout_ptr = dout_ptr + 4 * w_out; } #else @@ -1598,7 +2504,6 @@ void conv_depthwise_3x3s1p1_bias(float *dout, doutr0 = dout_ptr; doutr1 = dout_ptr + w_out; - // unsigned int* rst_mask = rmask; if (i == 0) { din_ptr0 = zero_ptr; @@ -1635,77 +2540,314 @@ void conv_depthwise_3x3s1p1_bias(float *dout, int cnt = cnt_col; unsigned int *rmask_ptr = rmask; unsigned int *vmask_ptr = vmask; - if (flag_relu) { - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 - MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 - MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - 
"memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } + act_switch_3x3s1p1(din_ptr0, + din_ptr1, + din_ptr2, + din_ptr3, + doutr0, + doutr1, + wr0, + wr1, + wr2, + vmask_ptr, + rmask_ptr, + vzero, + bias_val, + cnt, + act_param); dout_ptr += 2 * w_out; } //! end of processing mid rows #endif } } } - +void act_switch_3x3s1p1_s(const float *din_ptr0, + const float *din_ptr1, + const float *din_ptr2, + const float *din_ptr3, + float *doutr0, + float *doutr1, + float32x4_t wr0, + float32x4_t wr1, + float32x4_t wr2, + uint32x4_t vmask_rp, + float32x4_t vzero, + float32x4_t wbias, + const operators::ActivationParam act_param) { + bool has_active = act_param.has_active; + if (has_active) { +#ifdef __aarch64__ + float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); +#else + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; +#endif + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: +#ifdef __aarch64__ + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + break; +#else + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; +#endif + case lite_api::ActivationType::kRelu6: +/* 0 <= din <= 6 */ +#ifdef __aarch64__ + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU6 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [vsix] "w"(vsix), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + break; +#else + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU6 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [six_ptr] "r"(vsix), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; +#endif + case lite_api::ActivationType::kLeakyRelu: +/*din = din >= 0 ? 
din : din * scale*/ +#ifdef __aarch64__ + asm volatile(COMPUTE_S_S1 RESULT_S_S1_LEAKY_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [vscale] "w"(vscale), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20"); + break; +#else + asm volatile(COMPUTE_S_S1 RESULT_S_S1_LEAKY_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [scale_ptr] "r"(vscale), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; +#endif + default: + LOG(FATAL) << "this act_type: " + << static_cast(act_param.active_type) + << " fuse not support"; + } + } else { +#ifdef __aarch64__ + asm volatile(COMPUTE_S_S1 RESULT_S_S1 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); +#else + asm volatile(COMPUTE_S_S1 RESULT_S_S1 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); +#endif + } +} /** * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, * width <= 4 @@ -1722,6 +2864,7 @@ void conv_depthwise_3x3s1p1_bias_s(float *dout, const int w_in, const int h_out, const int w_out, + const operators::ActivationParam act_param, ARMContext *ctx) { //! 3x3s1 convolution, implemented by direct algorithm //! 
pad is done implicit @@ -1772,7 +2915,6 @@ void conv_depthwise_3x3s1p1_bias_s(float *dout, if (hs == -1) { dr0 = zero; } - switch (he - h_in) { case 2: dr2 = zero; @@ -1782,127 +2924,19 @@ void conv_depthwise_3x3s1p1_bias_s(float *dout, default: break; } -#ifdef __aarch64__ - if (flag_relu) { - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); - } else { - asm volatile(COMPUTE_S_S1 RESULT_S_S1 - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); - } -#else - if (flag_relu) { - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(COMPUTE_S_S1 RESULT_S_S1 - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } -#endif + act_switch_3x3s1p1_s(dr0, + dr1, + dr2, + dr3, + out_buf1, + out_buf2, + wr0, + wr1, + wr2, + vmask_rp, + vzero, + wbias, + act_param); for (int w = 0; w < w_out; ++w) { *doutr0++ = out_buf1[w]; *doutr1++ = out_buf2[w]; @@ -1916,6 +2950,490 @@ void conv_depthwise_3x3s1p1_bias_s(float *dout, } // end of processing batchs } +#ifdef __aarch64__ +void act_switch_3x3s1p0(const float *din_ptr0, + const float *din_ptr1, + const float *din_ptr2, + const float *din_ptr3, + const float *din_ptr4, + const float *din_ptr5, + float *doutr0, + float *doutr1, + float *doutr2, + float *doutr3, + float32x4_t wr0, + float32x4_t wr1, + float32x4_t wr2, + unsigned int *vmask, + unsigned int *rmask, + float32x4_t vzero, + float *vbias, + int cnt, + int remain, + const operators::ActivationParam act_param) { + bool has_active = act_param.has_active; + if (has_active) { + float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); + + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1_RELU + "cmp 
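
Vertical padding stays implicit throughout: rows that fall above or below the image are never materialized; their pointers are simply redirected to a shared zero buffer (`dr0 = zero`, `dr2`/`dr3 = zero`), so the asm reads zeros without branching. The pattern, sketched with an illustrative helper:

```cpp
// kZeroRow is sized for the narrow kernels, which read at most 8 floats.
static const float kZeroRow[8] = {0.f};

inline const float* row_ptr(const float* din, int row, int h_in,
                            int row_stride) {
  // Out-of-range rows read from the shared zero buffer instead of the input.
  return (row < 0 || row >= h_in) ? kZeroRow : din + row * row_stride;
}
```
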
%w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1_RELU6 + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU6 "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [vsix] "w"(vsix), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? 
din : din * scale*/
+        asm volatile(
+            INIT_S1
+            "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/
+            "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/
+            "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */
+            "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */
+            "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/
+            "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/
+            MID_COMPUTE_S1 MID_RESULT_S1_LEAKY_RELU
+            "cmp %w[remain], #1 \n"
+            "blt 0f \n" RIGHT_COMPUTE_S1
+            RIGHT_RESULT_S1_LEAKY_RELU "0: \n"
+            : [cnt] "+r"(cnt),
+              [din_ptr0] "+r"(din_ptr0),
+              [din_ptr1] "+r"(din_ptr1),
+              [din_ptr2] "+r"(din_ptr2),
+              [din_ptr3] "+r"(din_ptr3),
+              [din_ptr4] "+r"(din_ptr4),
+              [din_ptr5] "+r"(din_ptr5),
+              [doutr0] "+r"(doutr0),
+              [doutr1] "+r"(doutr1),
+              [doutr2] "+r"(doutr2),
+              [doutr3] "+r"(doutr3)
+            : [w0] "w"(wr0),
+              [w1] "w"(wr1),
+              [w2] "w"(wr2),
+              [vscale] "w"(vscale),
+              [bias_val] "r"(vbias),
+              [vmask] "r"(vmask),
+              [rmask] "r"(rmask),
+              [remain] "r"(remain)
+            : "cc", "memory",
+              "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+              "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+              "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+              "v24", "v25");
+        break;
+      default:
+        LOG(FATAL) << "this act_type: "
+                   << static_cast<int>(act_param.active_type)
+                   << " fuse not supported";
+    }
+  } else {
+    asm volatile(
+        INIT_S1
+        "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/
+        "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/
+        "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */
+        "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */
+        "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/
+        "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/
+        MID_COMPUTE_S1 MID_RESULT_S1
+        "cmp %w[remain], #1 \n"
+        "blt 0f \n" RIGHT_COMPUTE_S1 RIGHT_RESULT_S1
+        "0: \n"
+        : [cnt] "+r"(cnt),
+          [din_ptr0] "+r"(din_ptr0),
+          [din_ptr1] "+r"(din_ptr1),
+          [din_ptr2] "+r"(din_ptr2),
+          [din_ptr3] "+r"(din_ptr3),
+          [din_ptr4] "+r"(din_ptr4),
+          [din_ptr5] "+r"(din_ptr5),
+          [doutr0] "+r"(doutr0),
+          [doutr1] "+r"(doutr1),
+          [doutr2] "+r"(doutr2),
+          [doutr3] "+r"(doutr3)
+        : [w0] "w"(wr0),
+          [w1] "w"(wr1),
+          [w2] "w"(wr2),
+          [bias_val] "r"(vbias),
+          [vmask] "r"(vmask),
+          [rmask] "r"(rmask),
+          [vzero] "w"(vzero),
+          [remain] "r"(remain)
+        : "cc", "memory",
+          "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+          "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+          "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+          "v24", "v25");
+  }
+}
+#else
+void act_switch_3x3s1p0(const float *din_ptr0,
+                        const float *din_ptr1,
+                        const float *din_ptr2,
+                        const float *din_ptr3,
+                        float *doutr0,
+                        float *doutr1,
+                        float32x4_t wr0,
+                        float32x4_t wr1,
+                        float32x4_t wr2,
+                        unsigned int *vmask_ptr,
+                        unsigned int *rmask_ptr,
+                        float32x4_t vzero,
+                        float bias_val,
+                        int cnt,
+                        int remain,
+                        const operators::ActivationParam act_param) {
+  bool has_active = act_param.has_active;
+  if (has_active) {
+    float tmp = act_param.Relu_clipped_coef;
+    float ss = act_param.Leaky_relu_alpha;
+    float vsix[4] = {tmp, tmp, tmp, tmp};
+    float vscale[4] = {ss, ss, ss, ss};
+
+    switch (act_param.active_type) {
+      case lite_api::ActivationType::kRelu:
+        asm volatile(INIT_S1
+                     "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n"
+                     "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n"
+                     "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n"
+                     "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n"
+                     "vext.32 q6, q8, q9, #1
@ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1_RELU + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1_RELU6 + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU6 "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [six_ptr] "r"(vsix), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? 
din : din * scale*/
+        asm volatile(INIT_S1
+                     "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n"
+                     "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n"
+                     "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n"
+                     "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n"
+                     "vext.32 q6, q8, q9, #1 @ 0012\n"
+                     "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1
+                     MID_RESULT_S1_LEAKY_RELU
+                     "cmp %[remain], #1 \n"
+                     "blt 0f \n" RIGHT_COMPUTE_S1
+                     RIGHT_RESULT_S1_LEAKY_RELU
+                     "0: \n"
+                     : [dout_ptr1] "+r"(doutr0),
+                       [dout_ptr2] "+r"(doutr1),
+                       [din0_ptr] "+r"(din_ptr0),
+                       [din1_ptr] "+r"(din_ptr1),
+                       [din2_ptr] "+r"(din_ptr2),
+                       [din3_ptr] "+r"(din_ptr3),
+                       [cnt] "+r"(cnt),
+                       [rmask] "+r"(rmask_ptr),
+                       [vmask] "+r"(vmask_ptr)
+                     : [wr0] "w"(wr0),
+                       [wr1] "w"(wr1),
+                       [wr2] "w"(wr2),
+                       [scale_ptr] "r"(vscale),
+                       [bias_val] "r"(bias_val),
+                       [vzero] "w"(vzero),
+                       [remain] "r"(remain)
+                     : "cc", "memory",
+                       "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
+                       "q12", "q13", "q14", "q15");
+        break;
+      default:
+        LOG(FATAL) << "this act_type: "
+                   << static_cast<int>(act_param.active_type)
+                   << " fuse not supported";
+    }
+  } else {
+    asm volatile(
+        INIT_S1
+        "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n"
+        "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n"
+        "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n"
+        "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n"
+        "vext.32 q6, q8, q9, #1 @ 0012\n"
+        "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 MID_RESULT_S1
+        "cmp %[remain], #1 \n"
+        "blt 0f \n" RIGHT_COMPUTE_S1 RIGHT_RESULT_S1
+        "0: \n"
+        : [dout_ptr1] "+r"(doutr0),
+          [dout_ptr2] "+r"(doutr1),
+          [din0_ptr] "+r"(din_ptr0),
+          [din1_ptr] "+r"(din_ptr1),
+          [din2_ptr] "+r"(din_ptr2),
+          [din3_ptr] "+r"(din_ptr3),
+          [cnt] "+r"(cnt),
+          [rmask] "+r"(rmask_ptr),
+          [vmask] "+r"(vmask_ptr)
+        : [wr0] "w"(wr0),
+          [wr1] "w"(wr1),
+          [wr2] "w"(wr2),
+          [bias_val] "r"(bias_val),
+          [vzero] "w"(vzero),
+          [remain] "r"(remain)
+        : "cc", "memory",
+          "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
+          "q12", "q13", "q14", "q15");
+  }
+}
+#endif
/**
 * \brief depthwise convolution, kernel size 3x3, stride 1, pad 0, with bias,
 * width > 4
@@ -1932,6 +3450,7 @@ void conv_depthwise_3x3s1p0_bias(float *dout,
                                 const int w_in,
                                 const int h_out,
                                 const int w_out,
+                                const operators::ActivationParam act_param,
                                 ARMContext *ctx) {
  //!
pad is done implicit const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; @@ -2060,15 +3579,16 @@ void conv_depthwise_3x3s1p0_bias(float *dout, } int cnt = tile_w; + /* if (flag_relu) { asm volatile( INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" // vld1q_f32(din_ptr0) + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" // vld1q_f32(din_ptr0) + "ext v16.16b, v0.16b, v1.16b, #4 \n" // v16 = 1234 + "ext v17.16b, v0.16b, v1.16b, #8 \n" // v17 = 2345 + "ld1 {v9.4s}, [%[din_ptr4]] \n" // vld1q_f32(din_ptr0) + "ld1 {v11.4s}, [%[din_ptr5]] \n" // vld1q_f32(din_ptr0) MID_COMPUTE_S1 MID_RESULT_S1_RELU "cmp %w[remain], #1 \n" "blt 0f \n" RIGHT_COMPUTE_S1 @@ -2123,12 +3643,12 @@ void conv_depthwise_3x3s1p0_bias(float *dout, } else { asm volatile( INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" // vld1q_f32(din_ptr0) + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" // vld1q_f32(din_ptr0) + "ext v16.16b, v0.16b, v1.16b, #4 \n" // v16 = 1234 + "ext v17.16b, v0.16b, v1.16b, #8 \n" // v17 = 2345 + "ld1 {v9.4s}, [%[din_ptr4]] \n" // vld1q_f32(din_ptr0) + "ld1 {v11.4s}, [%[din_ptr5]] \n" // vld1q_f32(din_ptr0) MID_COMPUTE_S1 MID_RESULT_S1 "cmp %w[remain], #1 \n" "blt 0f \n" RIGHT_COMPUTE_S1 @@ -2181,6 +3701,27 @@ void conv_depthwise_3x3s1p0_bias(float *dout, "v24", "v25"); } + */ + act_switch_3x3s1p0(din_ptr0, + din_ptr1, + din_ptr2, + din_ptr3, + din_ptr4, + din_ptr5, + doutr0, + doutr1, + doutr2, + doutr3, + wr0, + wr1, + wr2, + vmask, + rmask, + vzero, + vbias, + cnt, + remain, + act_param); dout_ptr = dout_ptr + 4 * w_out; } #else @@ -2219,6 +3760,7 @@ void conv_depthwise_3x3s1p0_bias(float *dout, int cnt = tile_w; unsigned int *rmask_ptr = rmask; unsigned int *vmask_ptr = vmask; + /* if (flag_relu) { asm volatile(INIT_S1 "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" @@ -2301,13 +3843,328 @@ void conv_depthwise_3x3s1p0_bias(float *dout, "q13", "q14", "q15"); - } + }*/ + act_switch_3x3s1p0(din_ptr0, + din_ptr1, + din_ptr2, + din_ptr3, + doutr0, + doutr1, + wr0, + wr1, + wr2, + vmask_ptr, + rmask_ptr, + vzero, + bias_val, + cnt, + remain, + act_param); dout_ptr += 2 * w_out; } //! 
end of processing mid rows #endif } } } +void act_switch_3x3s1p0_s(const float *din_ptr0, + const float *din_ptr1, + const float *din_ptr2, + const float *din_ptr3, + float *doutr0, + float *doutr1, + float32x4_t wr0, + float32x4_t wr1, + float32x4_t wr2, + uint32x4_t vmask_rp1, + uint32x4_t vmask_rp2, + float32x4_t vzero, + float32x4_t wbias, + unsigned int *vmask_ptr, + float bias_val, + const operators::ActivationParam act_param) { + bool has_active = act_param.has_active; + if (has_active) { +#ifdef __aarch64__ + float32x4_t vsix = vdupq_n_f32(act_param.Relu_clipped_coef); + float32x4_t vscale = vdupq_n_f32(act_param.Leaky_relu_alpha); +#else + float tmp = act_param.Relu_clipped_coef; + float ss = act_param.Leaky_relu_alpha; + float vsix[4] = {tmp, tmp, tmp, tmp}; + float vscale[4] = {ss, ss, ss, ss}; +#endif + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: +#ifdef __aarch64__ + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [vzero] "w"(vzero), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + break; +#else + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; +#endif + case lite_api::ActivationType::kRelu6: +/* 0 <= din <= 6 */ +#ifdef __aarch64__ + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU6 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [vzero] "w"(vzero), + [vsix] "w"(vsix), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + break; +#else + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU6 + : [din0] "+r"(din_ptr0), + [din1] "+r"(din_ptr1), + [din2] "+r"(din_ptr2), + [din3] "+r"(din_ptr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [six_ptr] "r"(vsix), + [bias_val] "r"(bias_val), + [out1] "r"(doutr0), + [out2] "r"(doutr1) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + break; +#endif + case lite_api::ActivationType::kLeakyRelu: +/*din = din >= 0 ? 
din : din * scale*/
+#ifdef __aarch64__
+      asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_LEAKY_RELU
+                   : [din0] "+r"(din_ptr0),
+                     [din1] "+r"(din_ptr1),
+                     [din2] "+r"(din_ptr2),
+                     [din3] "+r"(din_ptr3)
+                   : [wr0] "w"(wr0),
+                     [wr1] "w"(wr1),
+                     [wr2] "w"(wr2),
+                     [vbias] "w"(wbias),
+                     [mask1] "w"(vmask_rp1),
+                     [mask2] "w"(vmask_rp2),
+                     [vzero] "w"(vzero),
+                     [vscale] "w"(vscale),
+                     [out1] "r"(doutr0),
+                     [out2] "r"(doutr1)
+                   : "cc", "memory",
+                     "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+                     "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
+      break;
+#else
+      asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_LEAKY_RELU
+                   : [din0] "+r"(din_ptr0),
+                     [din1] "+r"(din_ptr1),
+                     [din2] "+r"(din_ptr2),
+                     [din3] "+r"(din_ptr3),
+                     [vmask] "+r"(vmask_ptr)
+                   : [wr0] "w"(wr0),
+                     [wr1] "w"(wr1),
+                     [wr2] "w"(wr2),
+                     [vzero] "w"(vzero),
+                     [scale_ptr] "r"(vscale),
+                     [bias_val] "r"(bias_val),
+                     [out1] "r"(doutr0),
+                     [out2] "r"(doutr1)
+                   : "cc", "memory",
+                     "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
+                     "q12", "q13", "q14", "q15");
+      break;
+#endif
+    default:
+      LOG(FATAL) << "this act_type: "
+                 << static_cast<int>(act_param.active_type)
+                 << " fuse not supported";
+  }
+  } else {
+#ifdef __aarch64__
+    asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1
+                 : [din0] "+r"(din_ptr0),
+                   [din1] "+r"(din_ptr1),
+                   [din2] "+r"(din_ptr2),
+                   [din3] "+r"(din_ptr3)
+                 : [wr0] "w"(wr0),
+                   [wr1] "w"(wr1),
+                   [wr2] "w"(wr2),
+                   [vbias] "w"(wbias),
+                   [mask1] "w"(vmask_rp1),
+                   [mask2] "w"(vmask_rp2),
+                   [vzero] "w"(vzero),
+                   [out1] "r"(doutr0),
+                   [out2] "r"(doutr1)
+                 : "cc", "memory",
+                   "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+                   "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15");
+#else
+    asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1
+                 : [din0] "+r"(din_ptr0),
+                   [din1] "+r"(din_ptr1),
+                   [din2] "+r"(din_ptr2),
+                   [din3] "+r"(din_ptr3),
+                   [vmask] "+r"(vmask_ptr)
+                 : [wr0] "w"(wr0),
+                   [wr1] "w"(wr1),
+                   [wr2] "w"(wr2),
+                   [vzero] "w"(vzero),
+                   [bias_val] "r"(bias_val),
+                   [out1] "r"(doutr0),
+                   [out2] "r"(doutr1)
+                 : "cc", "memory",
+                   "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11",
+                   "q12", "q13", "q14", "q15");
+#endif
+  }
+}
/**
 * \brief depthwise convolution, kernel size 3x3, stride 1, pad 0, with bias,
 * width <= 4
@@ -2324,6 +4181,7 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout,
                                   const int w_in,
                                   const int h_out,
                                   const int w_out,
+                                   const operators::ActivationParam act_param,
                                   ARMContext *ctx) {
  //! 3x3s1 convolution, implemented by direct algorithm
  //!
pad is done implicit @@ -2355,15 +4213,22 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout, float32x4_t wr1 = vld1q_f32(weight_ptr + 3); float32x4_t wr2 = vld1q_f32(weight_ptr + 6); -#ifdef __aarch64__ + // #ifdef __aarch64__ + // float32x4_t wbias; + // if (flag_bias) { + // wbias = vdupq_n_f32(bias[i]); + // } else { + // wbias = vdupq_n_f32(0.f); + // } + // #endif // __aarch64__ float32x4_t wbias; + float bias_val = 0.f; if (flag_bias) { wbias = vdupq_n_f32(bias[i]); + bias_val = bias[i]; } else { wbias = vdupq_n_f32(0.f); } -#endif // __aarch64__ - float out_buf1[4]; float out_buf2[4]; float trash_buf[4]; @@ -2396,135 +4261,154 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout, break; } } -#ifdef __aarch64__ - if (flag_relu) { - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - } else { - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - } -#else + /* + #ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [vzero] "w"(vzero), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } else { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [vzero] "w"(vzero), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } + #else + unsigned int *vmask_ptr = vmask; + float bias_val = flag_bias ? 
bias[i] : 0.f; + if (flag_relu) { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + #endif + */ unsigned int *vmask_ptr = vmask; - float bias_val = flag_bias ? bias[i] : 0.f; - if (flag_relu) { - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } -#endif + act_switch_3x3s1p0_s(dr0, + dr1, + dr2, + dr3, + out_buf1, + out_buf2, + wr0, + wr1, + wr2, + vmask_rp1, + vmask_rp2, + vzero, + wbias, + vmask_ptr, + bias_val, + act_param); for (int w = 0; w < w_out; ++w) { *doutr0++ = out_buf1[w]; *doutr1++ = out_buf2[w]; diff --git a/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc index 08e5efecd751bcca534ba7a47035c5f70fa1f6bf..fd54e214cf27e001e21efcf255b09113bbe12d19 100644 --- a/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc +++ b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc @@ -25,6 +25,785 @@ namespace paddle { namespace lite { namespace arm { namespace math { +// clang-format off +#ifdef __aarch64__ +#define COMPUTE \ + "ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ \ + "ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/ \ + "ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ \ + "ldp q8, q9, [%[inr1]], #32\n" /* load input r1*/ \ + "ldp q4, q5, [%[inr0]]\n" /* load input r0*/ \ + "ldp q10, q11, [%[inr1]]\n" /* load input r1*/ \ + /* r0, r1, mul w0, get out r0, r1 */ \ + "fmul v15.4s , %[w0].4s, v0.4s\n" /* outr00 = w0 * r0, 0*/ \ + "fmul v16.4s , %[w0].4s, v1.4s\n" /* outr01 = w0 * r0, 1*/ \ + "fmul v17.4s , %[w0].4s, v2.4s\n" /* outr02 = w0 * r0, 2*/ \ + "fmul v18.4s , %[w0].4s, v3.4s\n" /* outr03 = w0 * r0, 3*/ \ + "fmul v19.4s , %[w0].4s, v6.4s\n" /* outr10 = w0 * r1, 0*/ \ + "fmul v20.4s , %[w0].4s, v7.4s\n" /* outr11 = w0 * r1, 1*/ \ + "fmul v21.4s , %[w0].4s, v8.4s\n" /* outr12 = w0 * r1, 2*/ \ + "fmul v22.4s , %[w0].4s, v9.4s\n" /* outr13 = w0 * r1, 3*/ \ + /* r0, r1, mul w1, get out r0, r1 */ 
\ + "fmla v15.4s , %[w1].4s, v1.4s\n" /* outr00 = w1 * r0[1]*/ \ + "ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/ \ + "fmla v16.4s , %[w1].4s, v2.4s\n" /* outr01 = w1 * r0[2]*/ \ + "fmla v17.4s , %[w1].4s, v3.4s\n" /* outr02 = w1 * r0[3]*/ \ + "fmla v18.4s , %[w1].4s, v4.4s\n" /* outr03 = w1 * r0[4]*/ \ + "fmla v19.4s , %[w1].4s, v7.4s\n" /* outr10 = w1 * r1[1]*/ \ + "fmla v20.4s , %[w1].4s, v8.4s\n" /* outr11 = w1 * r1[2]*/ \ + "fmla v21.4s , %[w1].4s, v9.4s\n" /* outr12 = w1 * r1[3]*/ \ + "fmla v22.4s , %[w1].4s, v10.4s\n"/* outr13 = w1 * r1[4]*/ \ + /* r0, r1, mul w2, get out r0, r1 */ \ + "fmla v15.4s , %[w2].4s, v2.4s\n" /* outr00 = w2 * r0[2]*/ \ + "fmla v16.4s , %[w2].4s, v3.4s\n" /* outr01 = w2 * r0[3]*/ \ + "ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/ \ + "fmla v17.4s , %[w2].4s, v4.4s\n" /* outr02 = w2 * r0[4]*/ \ + "fmla v18.4s , %[w2].4s, v5.4s\n" /* outr03 = w2 * r0[5]*/ \ + "ldp q4, q5, [%[inr2]]\n" /* load input r2*/ \ + "fmla v19.4s , %[w2].4s, v8.4s\n" /* outr10 = w2 * r1[2]*/ \ + "fmla v20.4s , %[w2].4s, v9.4s\n" /* outr11 = w2 * r1[3]*/ \ + "fmla v21.4s , %[w2].4s, v10.4s\n"/* outr12 = w2 * r1[4]*/ \ + "fmla v22.4s , %[w2].4s, v11.4s\n"/* outr13 = w2 * r1[5]*/ \ + /* r1, r2, mul w3, get out r0, r1 */ \ + "fmla v15.4s , %[w3].4s, v6.4s\n" /* outr00 = w3 * r1[0]*/ \ + "fmla v16.4s , %[w3].4s, v7.4s\n" /* outr01 = w3 * r1[1]*/ \ + "fmla v17.4s , %[w3].4s, v8.4s\n" /* outr02 = w3 * r1[2]*/ \ + "fmla v18.4s , %[w3].4s, v9.4s\n" /* outr03 = w3 * r1[3]*/ \ + "fmla v19.4s , %[w3].4s, v0.4s\n" /* outr10 = w3 * r2[0]*/ \ + "fmla v20.4s , %[w3].4s, v1.4s\n" /* outr11 = w3 * r2[1]*/ \ + "fmla v21.4s , %[w3].4s, v2.4s\n" /* outr12 = w3 * r2[2]*/ \ + "fmla v22.4s , %[w3].4s, v3.4s\n" /* outr13 = w3 * r2[3]*/ \ + /* r1, r2, mul w4, get out r0, r1 */ \ + "fmla v15.4s , %[w4].4s, v7.4s\n" /* outr00 = w4 * r1[1]*/ \ + "ldp q6, q7, [%[inr3]], #32\n" /* load input r3*/ \ + "fmla v16.4s , %[w4].4s, v8.4s\n" /* outr01 = w4 * r1[2]*/ \ + "fmla v17.4s , %[w4].4s, v9.4s\n" /* outr02 = w4 * r1[3]*/ \ + "fmla v18.4s , %[w4].4s, v10.4s\n"/* outr03 = w4 * r1[4]*/ \ + "ldp x0, x1, [%[outl]] \n" \ + "fmla v19.4s , %[w4].4s, v1.4s\n" /* outr10 = w4 * r2[1]*/ \ + "fmla v20.4s , %[w4].4s, v2.4s\n" /* outr11 = w4 * r2[2]*/ \ + "fmla v21.4s , %[w4].4s, v3.4s\n" /* outr12 = w4 * r2[3]*/ \ + "fmla v22.4s , %[w4].4s, v4.4s\n" /* outr13 = w4 * r2[4]*/ \ + /* r1, r2, mul w5, get out r0, r1 */ \ + "fmla v15.4s , %[w5].4s, v8.4s\n" /* outr00 = w5 * r1[2]*/ \ + "fmla v16.4s , %[w5].4s, v9.4s\n" /* outr01 = w5 * r1[3]*/ \ + "ldp q8, q9, [%[inr3]], #32\n" /* load input r3*/ \ + "fmla v17.4s , %[w5].4s, v10.4s\n"/* outr02 = w5 * r1[4]*/ \ + "fmla v18.4s , %[w5].4s, v11.4s\n"/* outr03 = w5 * r1[5]*/ \ + "ldp q10, q11, [%[inr3]]\n" /* load input r3*/ \ + "fmla v19.4s , %[w5].4s, v2.4s\n" /* outr10 = w5 * r2[2]*/ \ + "fmla v20.4s , %[w5].4s, v3.4s\n" /* outr11 = w5 * r2[3]*/ \ + "fmla v21.4s , %[w5].4s, v4.4s\n" /* outr12 = w5 * r2[4]*/ \ + "fmla v22.4s , %[w5].4s, v5.4s\n" /* outr13 = w5 * r2[5]*/ \ + /* r2, r3, mul w6, get out r0, r1 */ \ + "fmla v15.4s , %[w6].4s, v0.4s\n" /* outr00 = w6 * r2[0]*/ \ + "fmla v16.4s , %[w6].4s, v1.4s\n" /* outr01 = w6 * r2[1]*/ \ + "fmla v17.4s , %[w6].4s, v2.4s\n" /* outr02 = w6 * r2[2]*/ \ + "fmla v18.4s , %[w6].4s, v3.4s\n" /* outr03 = w6 * r2[3]*/ \ + "ldp x2, x3, [%[outl], #16] \n" \ + "fmla v19.4s , %[w6].4s, v6.4s\n" /* outr10 = w6 * r3[0]*/ \ + "fmla v20.4s , %[w6].4s, v7.4s\n" /* outr11 = w6 * r3[1]*/ \ + "fmla v21.4s , %[w6].4s, v8.4s\n" /* outr12 = w6 * r3[2]*/ \ + 
"fmla v22.4s , %[w6].4s, v9.4s\n" /* outr13 = w6 * r3[3]*/ \ + /* r2, r3, mul w7, get out r0, r1 */ \ + "fmla v15.4s , %[w7].4s, v1.4s\n" /* outr00 = w7 * r2[1]*/ \ + "fmla v16.4s , %[w7].4s, v2.4s\n" /* outr01 = w7 * r2[2]*/ \ + "fmla v17.4s , %[w7].4s, v3.4s\n" /* outr02 = w7 * r2[3]*/ \ + "fmla v18.4s , %[w7].4s, v4.4s\n" /* outr03 = w7 * r2[4]*/ \ + "ldp x4, x5, [%[outl], #32] \n" \ + "fmla v19.4s , %[w7].4s, v7.4s\n" /* outr10 = w7 * r3[1]*/ \ + "fmla v20.4s , %[w7].4s, v8.4s\n" /* outr11 = w7 * r3[2]*/ \ + "fmla v21.4s , %[w7].4s, v9.4s\n" /* outr12 = w7 * r3[3]*/ \ + "fmla v22.4s , %[w7].4s, v10.4s\n"/* outr13 = w7 * r3[4]*/ \ + /* r2, r3, mul w8, get out r0, r1 */ \ + "fmla v15.4s , %[w8].4s, v2.4s\n" /* outr00 = w8 * r2[2]*/ \ + "fmla v16.4s , %[w8].4s, v3.4s\n" /* outr01 = w8 * r2[3]*/ \ + "fmla v17.4s , %[w8].4s, v4.4s\n" /* outr02 = w8 * r2[0]*/ \ + "fmla v18.4s , %[w8].4s, v5.4s\n" /* outr03 = w8 * r2[1]*/ \ + "ldp x6, x7, [%[outl], #48] \n" \ + "fmla v19.4s , %[w8].4s, v8.4s\n" /* outr10 = w8 * r3[2]*/ \ + "fmla v20.4s , %[w8].4s, v9.4s\n" /* outr11 = w8 * r3[3]*/ \ + "fmla v21.4s , %[w8].4s, v10.4s\n"/* outr12 = w8 * r3[0]*/ \ + "fmla v22.4s , %[w8].4s, v11.4s\n"/* outr13 = w8 * r3[1]*/ \ + \ + "fadd v15.4s, v15.4s, %[vbias].4s\n"/* add bias */ \ + "fadd v16.4s, v16.4s, %[vbias].4s\n"/* add bias */ \ + "fadd v17.4s, v17.4s, %[vbias].4s\n"/* add bias */ \ + "fadd v18.4s, v18.4s, %[vbias].4s\n"/* add bias */ \ + "fadd v19.4s, v19.4s, %[vbias].4s\n"/* add bias */ \ + "fadd v20.4s, v20.4s, %[vbias].4s\n"/* add bias */ \ + "fadd v21.4s, v21.4s, %[vbias].4s\n"/* add bias */ \ + "fadd v22.4s, v22.4s, %[vbias].4s\n"/* add bias */ \ + /* transpose */ \ + "trn1 v0.4s, v15.4s, v16.4s\n" /* r0: a0a1c0c1*/ \ + "trn2 v1.4s, v15.4s, v16.4s\n" /* r0: b0b1d0d1*/ \ + "trn1 v2.4s, v17.4s, v18.4s\n" /* r0: a2a3c2c3*/ \ + "trn2 v3.4s, v17.4s, v18.4s\n" /* r0: b2b3d2d3*/ \ + "trn1 v4.4s, v19.4s, v20.4s\n" /* r1: a0a1c0c1*/ \ + "trn2 v5.4s, v19.4s, v20.4s\n" /* r1: b0b1d0d1*/ \ + "trn1 v6.4s, v21.4s, v22.4s\n" /* r1: a2a3c2c3*/ \ + "trn2 v7.4s, v21.4s, v22.4s\n" /* r1: b2b3d2d3*/ \ + "trn1 v15.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ \ + "trn2 v19.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ \ + "trn1 v17.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ \ + "trn2 v21.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/ \ + "trn1 v16.2d, v4.2d, v6.2d\n" /* r1: a0a1a2a3*/ \ + "trn2 v20.2d, v4.2d, v6.2d\n" /* r1: c0c1c2c3*/ \ + "trn1 v18.2d, v5.2d, v7.2d\n" /* r1: b0b1b2b3*/ \ + "trn2 v22.2d, v5.2d, v7.2d\n" /* r1: d0d1d2d3*/ + +#define RELU \ + "movi v0.4s, #0\n" /* for relu */ \ + "ldr x0, [%[outl], #80]\n" \ + "fmax v15.4s, v15.4s, v0.4s\n" \ + "fmax v16.4s, v16.4s, v0.4s\n" \ + "fmax v17.4s, v17.4s, v0.4s\n" \ + "fmax v18.4s, v18.4s, v0.4s\n" \ + "ld1 {v1.4s}, [x0]\n" \ + "fmax v19.4s, v19.4s, v0.4s\n" \ + "fmax v20.4s, v20.4s, v0.4s\n" \ + "fmax v21.4s, v21.4s, v0.4s\n" \ + "fmax v22.4s, v22.4s, v0.4s\n" \ + "ldr x0, [%[outl]]\n" \ + +#define RELU6 \ + "fmin v15.4s, v15.4s, v1.4s\n" \ + "fmin v16.4s, v16.4s, v1.4s\n" \ + "fmin v17.4s, v17.4s, v1.4s\n" \ + "fmin v18.4s, v18.4s, v1.4s\n" \ + "fmin v19.4s, v19.4s, v1.4s\n" \ + "fmin v20.4s, v20.4s, v1.4s\n" \ + "fmin v21.4s, v21.4s, v1.4s\n" \ + "fmin v22.4s, v22.4s, v1.4s\n" + +#define LEAKY_RELU \ + "movi v0.4s, #0\n" /* for relu */ \ + "ldr x0, [%[outl], #88]\n" \ + "cmhs v1.4s, v15.4s, v0.4s \n" /* vcgeq_u32 */ \ + "cmhs v2.4s, v16.4s, v0.4s \n" /* vcgeq_u32 */ \ + "ld1 {v9.4s}, [x0] \n" \ + "cmhs v3.4s, v17.4s, v0.4s \n" /* vcgeq_u32 */ \ + "cmhs v4.4s, v18.4s, v0.4s \n" /* 
vcgeq_u32 */ \ + "ldr x0, [%[outl]] \n" \ + "fmul v5.4s, v15.4s, v9.4s \n" /* mul */ \ + "fmul v6.4s, v16.4s, v9.4s \n" /* mul */ \ + "fmul v7.4s, v17.4s, v9.4s \n" /* mul */ \ + "fmul v8.4s, v18.4s, v9.4s \n" /* mul */ \ + "bif v15.16b, v5.16b, v1.16b \n" /* choose*/ \ + "bif v16.16b, v6.16b, v2.16b \n" /* choose*/ \ + "bif v17.16b, v7.16b, v3.16b \n" /* choose*/ \ + "bif v18.16b, v8.16b, v4.16b \n" /* choose*/ \ + "cmhs v1.4s, v19.4s, v0.4s \n" /* vcgeq_u32 */ \ + "cmhs v2.4s, v20.4s, v0.4s \n" /* vcgeq_u32 */ \ + "cmhs v3.4s, v21.4s, v0.4s \n" /* vcgeq_u32 */ \ + "cmhs v4.4s, v22.4s, v0.4s \n" /* vcgeq_u32 */ \ + "fmul v5.4s, v19.4s, v9.4s \n" /* mul */ \ + "fmul v6.4s, v20.4s, v9.4s \n" /* mul */ \ + "fmul v7.4s, v21.4s, v9.4s \n" /* mul */ \ + "fmul v8.4s, v22.4s, v9.4s \n" /* mul */ \ + "bif v19.16b, v5.16b, v1.16b \n" /* choose*/ \ + "bif v20.16b, v6.16b, v2.16b \n" /* choose*/ \ + "bif v21.16b, v7.16b, v3.16b \n" /* choose*/ \ + "bif v22.16b, v8.16b, v4.16b \n" /* choose*/ + +#define STORE \ + "cbnz %w[flag_mask], 1f\n" \ + "str q15, [x0]\n" /* save outc00 */ \ + "str q16, [x4]\n" /* save outc01 */ \ + "str q17, [x1]\n" /* save outc10 */ \ + "str q18, [x5]\n" /* save outc11 */ \ + "str q19, [x2]\n" /* save outc20 */ \ + "str q20, [x6]\n" /* save outc21 */ \ + "str q21, [x3]\n" /* save outc30 */ \ + "str q22, [x7]\n" /* save outc31 */ \ + "b 2f\n" \ + "1:\n" \ + "str q15, [%[out]], #16 \n" /* save remain to pre_out */ \ + "str q17, [%[out]], #16 \n" /* save remain to pre_out */ \ + "str q19, [%[out]], #16 \n" /* save remain to pre_out */ \ + "str q21, [%[out]], #16 \n" /* save remain to pre_out */ \ + "str q16, [%[out]], #16 \n" /* save remain to pre_out */ \ + "str q18, [%[out]], #16 \n" /* save remain to pre_out */ \ + "str q20, [%[out]], #16 \n" /* save remain to pre_out */ \ + "str q22, [%[out]], #16 \n" /* save remain to pre_out */ \ + "2:\n" +#else +#define COMPUTE \ + /* load weights */ \ + "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, w1, to q5, q6\n" \ + "vld1.32 {d14-d15}, [%[wc0]]! @ load w2, to q7\n" \ + /* load r0, r1 */ \ + "vld1.32 {d0-d3}, [%[r0]]! @ load r0, q0, q1\n" \ + "vld1.32 {d4-d7}, [%[r0]]! @ load r0, q2, q3\n" \ + /* main loop */ \ + "0: @ main loop\n" \ + /* mul r0 with w0, w1, w2, get out r0 */ \ + "vmul.f32 q8, q5, q0 @ w0 * inr00\n" \ + "vmul.f32 q9, q5, q1 @ w0 * inr01\n" \ + "vmul.f32 q10, q5, q2 @ w0 * inr02\n" \ + "vmul.f32 q11, q5, q3 @ w0 * inr03\n" \ + "vmla.f32 q8, q6, q1 @ w1 * inr01\n" \ + "vld1.32 {d0-d3}, [%[r0]] @ load r0, q0, q1\n" \ + "vmla.f32 q9, q6, q2 @ w1 * inr02\n" \ + "vmla.f32 q10, q6, q3 @ w1 * inr03\n" \ + "vmla.f32 q11, q6, q0 @ w1 * inr04\n" \ + "vmla.f32 q8, q7, q2 @ w2 * inr02\n" \ + "vmla.f32 q9, q7, q3 @ w2 * inr03\n" \ + "vld1.32 {d4-d7}, [%[r1]]! @ load r0, q2, q3\n" \ + "vmla.f32 q10, q7, q0 @ w2 * inr04\n" \ + "vmla.f32 q11, q7, q1 @ w2 * inr05\n" \ + "vld1.32 {d0-d3}, [%[r1]]! @ load r0, q0, q1\n" \ + "vld1.32 {d8-d9}, [%[wc0]]! @ load w3 to q4\n" \ + /* mul r1 with w0-w5, get out r0, r1 */ \ + "vmul.f32 q12, q5, q2 @ w0 * inr10\n" \ + "vmul.f32 q13, q5, q3 @ w0 * inr11\n" \ + "vmul.f32 q14, q5, q0 @ w0 * inr12\n" \ + "vmul.f32 q15, q5, q1 @ w0 * inr13\n" \ + "vld1.32 {d10-d11}, [%[wc0]]! 
@ load w4 to q5\n" \ + "vmla.f32 q8, q4, q2 @ w3 * inr10\n" \ + "vmla.f32 q9, q4, q3 @ w3 * inr11\n" \ + "vmla.f32 q10, q4, q0 @ w3 * inr12\n" \ + "vmla.f32 q11, q4, q1 @ w3 * inr13\n" \ + /* mul r1 with w1, w4, get out r1, r0 */ \ + "vmla.f32 q8, q5, q3 @ w4 * inr11\n" \ + "vmla.f32 q12, q6, q3 @ w1 * inr11\n" \ + "vld1.32 {d4-d7}, [%[r1]] @ load r1, q2, q3\n" \ + "vmla.f32 q9, q5, q0 @ w4 * inr12\n" \ + "vmla.f32 q13, q6, q0 @ w1 * inr12\n" \ + "vmla.f32 q10, q5, q1 @ w4 * inr13\n" \ + "vmla.f32 q14, q6, q1 @ w1 * inr13\n" \ + "vmla.f32 q11, q5, q2 @ w4 * inr14\n" \ + "vmla.f32 q15, q6, q2 @ w1 * inr14\n" \ + "vld1.32 {d12-d13}, [%[wc0]]! @ load w5 to q6\n" \ + /* mul r1 with w2, w5, get out r1, r0 */ \ + "vmla.f32 q12, q7, q0 @ w2 * inr12\n" \ + "vmla.f32 q13, q7, q1 @ w2 * inr13\n" \ + "vmla.f32 q8, q6, q0 @ w5 * inr12\n" \ + "vmla.f32 q9, q6, q1 @ w5 * inr13\n" \ + "vld1.32 {d0-d3}, [%[r2]]! @ load r2, q0, q1\n" \ + "vmla.f32 q14, q7, q2 @ w2 * inr14\n" \ + "vmla.f32 q15, q7, q3 @ w2 * inr15\n" \ + "vmla.f32 q10, q6, q2 @ w5 * inr14\n" \ + "vmla.f32 q11, q6, q3 @ w5 * inr15\n" \ + "vld1.32 {d4-d7}, [%[r2]]! @ load r2, q0, q1\n" \ + "vld1.32 {d14-d15}, [%[wc0]]! @ load w6, to q7\n" \ + /* mul r2 with w3-w8, get out r0, r1 */ \ + "vmla.f32 q12, q4, q0 @ w3 * inr20\n" \ + "vmla.f32 q13, q4, q1 @ w3 * inr21\n" \ + "vmla.f32 q14, q4, q2 @ w3 * inr22\n" \ + "vmla.f32 q15, q4, q3 @ w3 * inr23\n" \ + "vld1.32 {d8-d9}, [%[wc0]]! @ load w7, to q4\n" \ + "vmla.f32 q8, q7, q0 @ w6 * inr20\n" \ + "vmla.f32 q9, q7, q1 @ w6 * inr21\n" \ + "vmla.f32 q10, q7, q2 @ w6 * inr22\n" \ + "vmla.f32 q11, q7, q3 @ w6 * inr23\n" \ + /* mul r2 with w4, w7, get out r1, r0 */ \ + "vmla.f32 q8, q4, q1 @ w7 * inr21\n" \ + "vmla.f32 q12, q5, q1 @ w4 * inr21\n" \ + "vld1.32 {d0-d3}, [%[r2]] @ load r2, q0, q1\n" \ + "vmla.f32 q9, q4, q2 @ w7 * inr22\n" \ + "vmla.f32 q13, q5, q2 @ w4 * inr22\n" \ + "vmla.f32 q10, q4, q3 @ w7 * inr23\n" \ + "vmla.f32 q14, q5, q3 @ w4 * inr23\n" \ + "vmla.f32 q11, q4, q0 @ w7 * inr24\n" \ + "vmla.f32 q15, q5, q0 @ w4 * inr24\n" \ + "vld1.32 {d10-d11}, [%[wc0]]! @ load w8 to q5\n" \ + /* mul r1 with w5, w8, get out r1, r0 */ \ + "vmla.f32 q12, q6, q2 @ w5 * inr22\n" \ + "vmla.f32 q13, q6, q3 @ w5 * inr23\n" \ + "vmla.f32 q8, q5, q2 @ w8 * inr22\n" \ + "vmla.f32 q9, q5, q3 @ w8 * inr23\n" \ + "vld1.32 {d4-d7}, [%[r3]]! @ load r3, q2, q3\n" \ + "ldr r4, [%[outl], #32] @ load bias addr to r4\n" \ + "vmla.f32 q14, q6, q0 @ w5 * inr24\n" \ + "vmla.f32 q15, q6, q1 @ w5 * inr25\n" \ + "vmla.f32 q10, q5, q0 @ w8 * inr24\n" \ + "vmla.f32 q11, q5, q1 @ w8 * inr25\n" \ + "vld1.32 {d0-d3}, [%[r3]]! 
@ load r3, q0, q1\n" \ + "sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" \ + /* mul r3 with w6, w7, w8, get out r1 */ \ + "vmla.f32 q12, q7, q2 @ w6 * inr30\n" \ + "vmla.f32 q13, q7, q3 @ w6 * inr31\n" \ + "vmla.f32 q14, q7, q0 @ w6 * inr32\n" \ + "vmla.f32 q15, q7, q1 @ w6 * inr33\n" \ + "vmla.f32 q12, q4, q3 @ w7 * inr31\n" \ + "vld1.32 {d4-d7}, [%[r3]] @ load r3, q2, q3\n" \ + "vld1.32 {d12-d13}, [r4] @ load bias\n" \ + "vmla.f32 q13, q4, q0 @ w7 * inr32\n" \ + "vmla.f32 q14, q4, q1 @ w7 * inr33\n" \ + "vmla.f32 q15, q4, q2 @ w7 * inr34\n" \ + "ldr r0, [%[outl]] @ load outc00 to r0\n" \ + "vmla.f32 q12, q5, q0 @ w8 * inr32\n" \ + "vmla.f32 q13, q5, q1 @ w8 * inr33\n" \ + "ldr r5, [%[outl], #36] @ load flag_relu to r5\n" \ + "vmla.f32 q14, q5, q2 @ w8 * inr34\n" \ + "vmla.f32 q15, q5, q3 @ w8 * inr35\n" \ + "ldr r1, [%[outl], #4] @ load outc10 to r1\n" \ + "vadd.f32 q8, q8, q6 @ r00 add bias\n" \ + "vadd.f32 q9, q9, q6 @ r01 add bias\n" \ + "vadd.f32 q10, q10, q6 @ r02 add bias\n" \ + "vadd.f32 q11, q11, q6 @ r03 add bias\n" \ + "ldr r2, [%[outl], #8] @ load outc20 to r2\n" \ + "vadd.f32 q12, q12, q6 @ r10 add bias\n" \ + "vadd.f32 q13, q13, q6 @ r11 add bias\n" \ + "vadd.f32 q14, q14, q6 @ r12 add bias\n" \ + "vadd.f32 q15, q15, q6 @ r13 add bias\n" \ + "ldr r3, [%[outl], #12] @ load outc30 to r3\n" \ + "vmov.u32 q7, #0 @ mov zero to q7\n" +#define RELU \ + "vmax.f32 q8, q8, q7 @ r00 relu\n" \ + "vmax.f32 q9, q9, q7 @ r01 relu\n" \ + "vmax.f32 q10, q10, q7 @ r02 relu\n" \ + "vmax.f32 q11, q11, q7 @ r03 relu\n" \ + "vmax.f32 q12, q12, q7 @ r10 relu\n" \ + "vmax.f32 q13, q13, q7 @ r11 relu\n" \ + "vmax.f32 q14, q14, q7 @ r12 relu\n" \ + "vmax.f32 q15, q15, q7 @ r13 relu\n" + +#define RELU6 \ + "ldr r4, [%[outl], #40] @ load six to r4\n" \ + "vld1.32 {d12-d13}, [r4] @load data \n" \ + "vmin.f32 q8, q8, q6 @ r00 relu\n" \ + "vmin.f32 q9, q9, q6 @ r01 relu\n" \ + "vmin.f32 q10, q10, q6 @ r02 relu\n" \ + "vmin.f32 q11, q11, q6 @ r03 relu\n" \ + "vmin.f32 q12, q12, q6 @ r10 relu\n" \ + "vmin.f32 q13, q13, q6 @ r11 relu\n" \ + "vmin.f32 q14, q14, q6 @ r12 relu\n" \ + "vmin.f32 q15, q15, q6 @ r13 relu\n" + +#define LEAKY_RELU \ + "ldr r4, [%[outl], #44] @ load scale to r4\n" \ + "vld1.32 {d12-d13}, [r4] @load data \n" \ + "vcge.f32 q0, q8, q7 @ q0 > 0 \n" \ + "vcge.f32 q1, q9, q7 @ q0 > 0 \n" \ + "vmul.f32 q4, q8, q6 \n" \ + "vmul.f32 q5, q9, q6 \n" \ + "vcge.f32 q2, q10, q7 @ q0 > 0 \n" \ + "vcge.f32 q3, q11, q7 @ q0 > 0 \n" \ + "vbif q8, q4, q0 @ choose \n" \ + "vbif q9, q5, q1 @ choose \n" \ + "vmul.f32 q4, q10, q6 \n" \ + "vmul.f32 q5, q11, q6 \n" \ + "vbif q10, q4, q2 @ choose \n" \ + "vbif q11, q5, q3 @ choose \n" \ + "vcge.f32 q0, q12, q7 @ q0 > 0 \n" \ + "vcge.f32 q1, q13, q7 @ q0 > 0 \n" \ + "vmul.f32 q4, q12, q6 \n" \ + "vmul.f32 q5, q13, q6 \n" \ + "vcge.f32 q2, q14, q7 @ q0 > 0 \n" \ + "vcge.f32 q3, q15, q7 @ q0 > 0 \n" \ + "vbif q12, q4, q0 @ choose \n" \ + "vbif q13, q5, q1 @ choose \n" \ + "vmul.f32 q4, q14, q6 \n" \ + "vmul.f32 q5, q15, q6 \n" \ + "vbif q14, q4, q2 @ choose \n" \ + "vbif q15, q5, q3 @ choose \n" + +#define STORE \ + "ldr r4, [%[outl], #16] @ load outc01 to r4\n" \ + "vtrn.32 q8, q9 @ r0: q8 : a0a1c0c1, q9 : b0b1d0d1\n" \ + "vtrn.32 q10, q11 @ r0: q10: a2a3c2c3, q11: b2b3d2d3\n" \ + "vtrn.32 q12, q13 @ r1: q12: a0a1c0c1, q13: b0b1d0d1\n" \ + "vtrn.32 q14, q15 @ r1: q14: a2a3c2c3, q15: b2b3d2d3\n" \ + "ldr r5, [%[outl], #20] @ load outc11 to r5\n" \ + "vswp d17, d20 @ r0: q8 : a0a1a2a3, q10: c0c1c2c3 \n" \ + "vswp d19, d22 @ r0: q9 : b0b1b2b3, q11: 
d0d1d2d3 \n" \ + "vswp d25, d28 @ r1: q12: a0a1a2a3, q14: c0c1c2c3 \n" \ + "vswp d27, d30 @ r1: q13: b0b1b2b3, q15: d0d1d2d3 \n" \ + "cmp %[flag_mask], #0 @ cmp flag mask\n" \ + "bne 2f\n" \ + "vst1.32 {d16-d17}, [r0] @ save outc00\n" \ + "vst1.32 {d18-d19}, [r1] @ save outc10\n" \ + "vst1.32 {d20-d21}, [r2] @ save outc20\n" \ + "vst1.32 {d22-d23}, [r3] @ save outc30\n" \ + "vst1.32 {d24-d25}, [r4] @ save outc01\n" \ + "vst1.32 {d26-d27}, [r5] @ save outc11\n" \ + "ldr r0, [%[outl], #24] @ load outc21 to r0\n" \ + "ldr r1, [%[outl], #28] @ load outc31 to r1\n" \ + "vst1.32 {d28-d29}, [r0] @ save outc21\n" \ + "vst1.32 {d30-d31}, [r1] @ save outc31\n" \ + "b 3f @ branch end\n" \ + "2: \n" \ + "vst1.32 {d16-d17}, [%[out0]]! @ save remain to pre_out\n" \ + "vst1.32 {d18-d19}, [%[out0]]! @ save remain to pre_out\n" \ + "vst1.32 {d20-d21}, [%[out0]]! @ save remain to pre_out\n" \ + "vst1.32 {d22-d23}, [%[out0]]! @ save remain to pre_out\n" \ + "vst1.32 {d24-d25}, [%[out0]]! @ save remain to pre_out\n" \ + "vst1.32 {d26-d27}, [%[out0]]! @ save remain to pre_out\n" \ + "vst1.32 {d28-d29}, [%[out0]]! @ save remain to pre_out\n" \ + "vst1.32 {d30-d31}, [%[out0]]! @ save remain to pre_out\n" \ + "3: \n" +#endif +// clang-format on +void act_switch_3x3s1(const float* inr0, + const float* inr1, + const float* inr2, + const float* inr3, + float* out0, + const float* weight_c, + float flag_mask, + void* outl_ptr, + float32x4_t w0, + float32x4_t w1, + float32x4_t w2, + float32x4_t w3, + float32x4_t w4, + float32x4_t w5, + float32x4_t w6, + float32x4_t w7, + float32x4_t w8, + float32x4_t vbias, + const operators::ActivationParam act_param) { + bool has_active = act_param.has_active; + if (has_active) { + switch (act_param.active_type) { + case lite_api::ActivationType::kRelu: +#ifdef __aarch64__ + asm volatile(COMPUTE RELU STORE + : [inr0] "+r"(inr0), + [inr1] "+r"(inr1), + [inr2] "+r"(inr2), + [inr3] "+r"(inr3), + [out] "+r"(out0) + : [w0] "w"(w0), + [w1] "w"(w1), + [w2] "w"(w2), + [w3] "w"(w3), + [w4] "w"(w4), + [w5] "w"(w5), + [w6] "w"(w6), + [w7] "w"(w7), + [w8] "w"(w8), + [vbias] "w"(vbias), + [outl] "r"(outl_ptr), + [flag_mask] "r"(flag_mask) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "x0", + "x1", + "x2", + "x3", + "x4", + "x5", + "x6", + "x7"); +#else + asm volatile(COMPUTE RELU STORE + : [r0] "+r"(inr0), + [r1] "+r"(inr1), + [r2] "+r"(inr2), + [r3] "+r"(inr3), + [out0] "+r"(out0), + [wc0] "+r"(weight_c) + : [flag_mask] "r"(flag_mask), [outl] "r"(outl_ptr) + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15", + "r0", + "r1", + "r2", + "r3", + "r4", + "r5"); +#endif + break; + case lite_api::ActivationType::kRelu6: +#ifdef __aarch64__ + asm volatile(COMPUTE RELU RELU6 STORE + : [inr0] "+r"(inr0), + [inr1] "+r"(inr1), + [inr2] "+r"(inr2), + [inr3] "+r"(inr3), + [out] "+r"(out0) + : [w0] "w"(w0), + [w1] "w"(w1), + [w2] "w"(w2), + [w3] "w"(w3), + [w4] "w"(w4), + [w5] "w"(w5), + [w6] "w"(w6), + [w7] "w"(w7), + [w8] "w"(w8), + [vbias] "w"(vbias), + [outl] "r"(outl_ptr), + [flag_mask] "r"(flag_mask) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "x0", + "x1", + "x2", + "x3", + "x4", + "x5", + "x6", + "x7"); 
+#else
+      asm volatile(COMPUTE RELU RELU6 STORE
+                   : [r0] "+r"(inr0),
+                     [r1] "+r"(inr1),
+                     [r2] "+r"(inr2),
+                     [r3] "+r"(inr3),
+                     [out0] "+r"(out0),
+                     [wc0] "+r"(weight_c)
+                   : [flag_mask] "r"(flag_mask), [outl] "r"(outl_ptr)
+                   : "cc", "memory",
+                     "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+                     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15",
+                     "r0", "r1", "r2", "r3", "r4", "r5");
+#endif
+      break;
+    case lite_api::ActivationType::kLeakyRelu:
+#ifdef __aarch64__
+      asm volatile(COMPUTE LEAKY_RELU STORE
+                   : [inr0] "+r"(inr0),
+                     [inr1] "+r"(inr1),
+                     [inr2] "+r"(inr2),
+                     [inr3] "+r"(inr3),
+                     [out] "+r"(out0)
+                   : [w0] "w"(w0),
+                     [w1] "w"(w1),
+                     [w2] "w"(w2),
+                     [w3] "w"(w3),
+                     [w4] "w"(w4),
+                     [w5] "w"(w5),
+                     [w6] "w"(w6),
+                     [w7] "w"(w7),
+                     [w8] "w"(w8),
+                     [vbias] "w"(vbias),
+                     [outl] "r"(outl_ptr),
+                     [flag_mask] "r"(flag_mask)
+                   : "cc", "memory",
+                     "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+                     "v8", "v9", "v10", "v11", "v15", "v16", "v17", "v18",
+                     "v19", "v20", "v21", "v22",
+                     "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7");
+#else
+      asm volatile(COMPUTE LEAKY_RELU STORE
+                   : [r0] "+r"(inr0),
+                     [r1] "+r"(inr1),
+                     [r2] "+r"(inr2),
+                     [r3] "+r"(inr3),
+                     [out0] "+r"(out0),
+                     [wc0] "+r"(weight_c)
+                   : [flag_mask] "r"(flag_mask), [outl] "r"(outl_ptr)
+                   : "cc", "memory",
+                     "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+                     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15",
+                     "r0", "r1", "r2", "r3", "r4", "r5");
+#endif
+      break;
+    default:
+      LOG(FATAL) << "this act_type: "
+                 << static_cast<int>(act_param.active_type)
+                 << " fuse not supported";
+  }
+  } else {
+#ifdef __aarch64__
+    asm volatile(COMPUTE STORE
+                 : [inr0] "+r"(inr0),
+                   [inr1] "+r"(inr1),
+                   [inr2] "+r"(inr2),
+                   [inr3] "+r"(inr3),
+                   [out] "+r"(out0)
+                 : [w0] "w"(w0),
+                   [w1] "w"(w1),
+                   [w2] "w"(w2),
+                   [w3] "w"(w3),
+                   [w4] "w"(w4),
+                   [w5] "w"(w5),
+                   [w6] "w"(w6),
+                   [w7] "w"(w7),
+                   [w8] "w"(w8),
+                   [vbias] "w"(vbias),
+                   [outl] "r"(outl_ptr),
+                   [flag_mask] "r"(flag_mask)
+                 : "cc", "memory",
+                   "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+                   "v8", "v9", "v10", "v11", "v15", "v16", "v17", "v18",
+                   "v19", "v20", "v21", "v22",
+                   "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7");
+#else
+    asm volatile(COMPUTE STORE
+                 : [r0] "+r"(inr0),
+                   [r1] "+r"(inr1),
+                   [r2] "+r"(inr2),
+                   [r3] "+r"(inr3),
+                   [out0] "+r"(out0),
+                   [wc0] "+r"(weight_c)
+                 : [flag_mask] "r"(flag_mask), [outl] "r"(outl_ptr)
+                 : "cc", "memory",
+                   "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+                   "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15",
+                   "r0", "r1", "r2", "r3", "r4", "r5");
+#endif
+  }
+}
void conv_3x3s1_depthwise_fp32(const float* i_data,
                               float* o_data,
                               int bs,
@@ -37,6 +816,7 @@ void conv_3x3s1_depthwise_fp32(const float* i_data,
                               const float* weights,
                               const float* bias,
                               const operators::ConvParam& param,
+                              const operators::ActivationParam act_param,
                               ARMContext* ctx) {
  int threads = ctx->threads();
@@ -78,6 +858,31 @@ void conv_3x3s1_depthwise_fp32(const float* i_data,
  remain = remain > 0 ? remain : 0;
  int row_len = win_round * out_c_block;
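// Editor's note: relu_ptr/six_ptr/scale_ptr below are appended to the outl[]
// pointer table after the eight output pointers and bias_local, so the
// activation macros can fetch them at fixed byte offsets from [%[outl]]:
// with 8-byte pointers on aarch64 that is bias at #64, relu at #72, six at
// #80 and scale at #88; with 4-byte pointers on armv7 the same slots are
// #32, #36, #40 and #44, matching the "ldr r4, [%[outl], #40]" style loads
// in RELU6/LEAKY_RELU.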
+  float six_ptr[4] = {0.f, 0.f, 0.f, 0.f};
+  float scale_ptr[4] = {1.f, 1.f, 1.f, 1.f};
+  float relu_ptr[4] = {0.f, 0.f, 0.f, 0.f};
+  if (act_param.has_active) {
+    switch (act_param.active_type) {
+      case lite_api::ActivationType::kRelu:
+        break;
+      case lite_api::ActivationType::kRelu6:
+        six_ptr[0] = act_param.Relu_clipped_coef;
+        six_ptr[1] = act_param.Relu_clipped_coef;
+        six_ptr[2] = act_param.Relu_clipped_coef;
+        six_ptr[3] = act_param.Relu_clipped_coef;
+        break;
+      case lite_api::ActivationType::kLeakyRelu:
+        scale_ptr[0] = act_param.Leaky_relu_alpha;
+        scale_ptr[1] = act_param.Leaky_relu_alpha;
+        scale_ptr[2] = act_param.Leaky_relu_alpha;
+        scale_ptr[3] = act_param.Leaky_relu_alpha;
+        break;
+      default:
+        LOG(FATAL) << "this act_type: "
+                   << static_cast<int>(act_param.active_type)
+                   << " fuse not supported";
+    }
+  }
  for (int n = 0; n < bs; ++n) {
    const float* din_batch = i_data + n * ic * size_in_channel;
    float* dout_batch = o_data + n * oc * size_out_channel;
@@ -147,6 +952,7 @@ void conv_3x3s1_depthwise_fp32(const float* i_data,
          outc21 = ptr_write;
          outc31 = ptr_write;
        }
+
        float* outl[] = {outc00,
                         outc10,
                         outc20,
                         outc30,
                         outc01,
                         outc11,
                         outc21,
                         outc31,
                         reinterpret_cast<float*>(bias_local),
-                        reinterpret_cast<float*>(flag_relu)};
+                        reinterpret_cast<float*>(relu_ptr),
+                        reinterpret_cast<float*>(six_ptr),
+                        reinterpret_cast<float*>(scale_ptr)};
        void* outl_ptr = reinterpret_cast<void*>(outl);
        for (int w = 0; w < w_loop; ++w) {
          bool flag_mask = (w == w_loop - 1) && flag_remain;
          float* out0 = pre_out;
-// clang-format off
#ifdef __aarch64__
-          asm volatile(
-              "ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/
-              "ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/
-              "ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/
-              "ldp q8, q9, [%[inr1]], #32\n" /* load input r1*/
-              "ldp q4, q5, [%[inr0]]\n" /* load input r0*/
-              "ldp q10, q11, [%[inr1]]\n" /* load input r1*/
-              /* r0, r1, mul w0, get out r0, r1 */
-              "fmul v15.4s , %[w0].4s, v0.4s\n" /* outr00 = w0 * r0, 0*/
-              "fmul v16.4s , %[w0].4s, v1.4s\n" /* outr01 = w0 * r0, 1*/
-              "fmul v17.4s , %[w0].4s, v2.4s\n" /* outr02 = w0 * r0, 2*/
-              "fmul v18.4s , %[w0].4s, v3.4s\n" /* outr03 = w0 * r0, 3*/
-              "fmul v19.4s , %[w0].4s, v6.4s\n" /* outr10 = w0 * r1, 0*/
-              "fmul v20.4s , %[w0].4s, v7.4s\n" /* outr11 = w0 * r1, 1*/
-              "fmul v21.4s , %[w0].4s, v8.4s\n" /* outr12 = w0 * r1, 2*/
-              "fmul v22.4s , %[w0].4s, v9.4s\n" /* outr13 = w0 * r1, 3*/
-              /* r0, r1, mul w1, get out r0, r1 */
-              "fmla v15.4s , %[w1].4s, v1.4s\n" /* outr00 = w1 * r0[1]*/
-              "ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/
-              "fmla v16.4s , %[w1].4s, v2.4s\n" /* outr01 = w1 * r0[2]*/
-              "fmla v17.4s , %[w1].4s, v3.4s\n" /* outr02 = w1 * r0[3]*/
-              "fmla v18.4s , %[w1].4s, v4.4s\n" /* outr03 = w1 * r0[4]*/
-              "fmla v19.4s , %[w1].4s, v7.4s\n" /* outr10 = w1 * r1[1]*/
-              "fmla v20.4s , %[w1].4s, v8.4s\n" /* outr11 = w1 * r1[2]*/
-              "fmla v21.4s , %[w1].4s, v9.4s\n" /* outr12 = w1 * r1[3]*/
-              "fmla v22.4s , %[w1].4s, v10.4s\n"/* outr13 = w1 * r1[4]*/
-              /* r0, r1, mul w2, get out r0, r1 */
-              "fmla v15.4s , %[w2].4s, v2.4s\n" /* outr00 = w2 * r0[2]*/
-              "fmla v16.4s , %[w2].4s, v3.4s\n" /* outr01 = w2 * r0[3]*/
-              "ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/
-              "fmla v17.4s , %[w2].4s, v4.4s\n" /* outr02 = w2 * r0[4]*/
-              "fmla v18.4s , %[w2].4s, v5.4s\n" /* outr03 = w2 * r0[5]*/
-              "ldp q4, q5, [%[inr2]]\n" /* load input r2*/
-              "fmla v19.4s , %[w2].4s, v8.4s\n" /* outr10 = w2 * r1[2]*/
-              "fmla v20.4s , %[w2].4s, v9.4s\n" /* outr11 = w2 * r1[3]*/
-              "fmla
v21.4s , %[w2].4s, v10.4s\n"/* outr12 = w2 * r1[4]*/ - "fmla v22.4s , %[w2].4s, v11.4s\n"/* outr13 = w2 * r1[5]*/ - /* r1, r2, mul w3, get out r0, r1 */ - "fmla v15.4s , %[w3].4s, v6.4s\n" /* outr00 = w3 * r1[0]*/ - "fmla v16.4s , %[w3].4s, v7.4s\n" /* outr01 = w3 * r1[1]*/ - "fmla v17.4s , %[w3].4s, v8.4s\n" /* outr02 = w3 * r1[2]*/ - "fmla v18.4s , %[w3].4s, v9.4s\n" /* outr03 = w3 * r1[3]*/ - "fmla v19.4s , %[w3].4s, v0.4s\n" /* outr10 = w3 * r2[0]*/ - "fmla v20.4s , %[w3].4s, v1.4s\n" /* outr11 = w3 * r2[1]*/ - "fmla v21.4s , %[w3].4s, v2.4s\n" /* outr12 = w3 * r2[2]*/ - "fmla v22.4s , %[w3].4s, v3.4s\n" /* outr13 = w3 * r2[3]*/ - /* r1, r2, mul w4, get out r0, r1 */ - "fmla v15.4s , %[w4].4s, v7.4s\n" /* outr00 = w4 * r1[1]*/ - "ldp q6, q7, [%[inr3]], #32\n" /* load input r3*/ - "fmla v16.4s , %[w4].4s, v8.4s\n" /* outr01 = w4 * r1[2]*/ - "fmla v17.4s , %[w4].4s, v9.4s\n" /* outr02 = w4 * r1[3]*/ - "fmla v18.4s , %[w4].4s, v10.4s\n"/* outr03 = w4 * r1[4]*/ - "ldp x0, x1, [%[outl]] \n" - "fmla v19.4s , %[w4].4s, v1.4s\n" /* outr10 = w4 * r2[1]*/ - "fmla v20.4s , %[w4].4s, v2.4s\n" /* outr11 = w4 * r2[2]*/ - "fmla v21.4s , %[w4].4s, v3.4s\n" /* outr12 = w4 * r2[3]*/ - "fmla v22.4s , %[w4].4s, v4.4s\n" /* outr13 = w4 * r2[4]*/ - /* r1, r2, mul w5, get out r0, r1 */ - "fmla v15.4s , %[w5].4s, v8.4s\n" /* outr00 = w5 * r1[2]*/ - "fmla v16.4s , %[w5].4s, v9.4s\n" /* outr01 = w5 * r1[3]*/ - "ldp q8, q9, [%[inr3]], #32\n" /* load input r3*/ - "fmla v17.4s , %[w5].4s, v10.4s\n"/* outr02 = w5 * r1[4]*/ - "fmla v18.4s , %[w5].4s, v11.4s\n"/* outr03 = w5 * r1[5]*/ - "ldp q10, q11, [%[inr3]]\n" /* load input r3*/ - "fmla v19.4s , %[w5].4s, v2.4s\n" /* outr10 = w5 * r2[2]*/ - "fmla v20.4s , %[w5].4s, v3.4s\n" /* outr11 = w5 * r2[3]*/ - "fmla v21.4s , %[w5].4s, v4.4s\n" /* outr12 = w5 * r2[4]*/ - "fmla v22.4s , %[w5].4s, v5.4s\n" /* outr13 = w5 * r2[5]*/ - /* r2, r3, mul w6, get out r0, r1 */ - "fmla v15.4s , %[w6].4s, v0.4s\n" /* outr00 = w6 * r2[0]*/ - "fmla v16.4s , %[w6].4s, v1.4s\n" /* outr01 = w6 * r2[1]*/ - "fmla v17.4s , %[w6].4s, v2.4s\n" /* outr02 = w6 * r2[2]*/ - "fmla v18.4s , %[w6].4s, v3.4s\n" /* outr03 = w6 * r2[3]*/ - "ldp x2, x3, [%[outl], #16] \n" - "fmla v19.4s , %[w6].4s, v6.4s\n" /* outr10 = w6 * r3[0]*/ - "fmla v20.4s , %[w6].4s, v7.4s\n" /* outr11 = w6 * r3[1]*/ - "fmla v21.4s , %[w6].4s, v8.4s\n" /* outr12 = w6 * r3[2]*/ - "fmla v22.4s , %[w6].4s, v9.4s\n" /* outr13 = w6 * r3[3]*/ - /* r2, r3, mul w7, get out r0, r1 */ - "fmla v15.4s , %[w7].4s, v1.4s\n" /* outr00 = w7 * r2[1]*/ - "fmla v16.4s , %[w7].4s, v2.4s\n" /* outr01 = w7 * r2[2]*/ - "fmla v17.4s , %[w7].4s, v3.4s\n" /* outr02 = w7 * r2[3]*/ - "fmla v18.4s , %[w7].4s, v4.4s\n" /* outr03 = w7 * r2[4]*/ - "ldp x4, x5, [%[outl], #32] \n" - "fmla v19.4s , %[w7].4s, v7.4s\n" /* outr10 = w7 * r3[1]*/ - "fmla v20.4s , %[w7].4s, v8.4s\n" /* outr11 = w7 * r3[2]*/ - "fmla v21.4s , %[w7].4s, v9.4s\n" /* outr12 = w7 * r3[3]*/ - "fmla v22.4s , %[w7].4s, v10.4s\n"/* outr13 = w7 * r3[4]*/ - /* r2, r3, mul w8, get out r0, r1 */ - "fmla v15.4s , %[w8].4s, v2.4s\n" /* outr00 = w8 * r2[2]*/ - "fmla v16.4s , %[w8].4s, v3.4s\n" /* outr01 = w8 * r2[3]*/ - "fmla v17.4s , %[w8].4s, v4.4s\n" /* outr02 = w8 * r2[0]*/ - "fmla v18.4s , %[w8].4s, v5.4s\n" /* outr03 = w8 * r2[1]*/ - "ldp x6, x7, [%[outl], #48] \n" - "fmla v19.4s , %[w8].4s, v8.4s\n" /* outr10 = w8 * r3[2]*/ - "fmla v20.4s , %[w8].4s, v9.4s\n" /* outr11 = w8 * r3[3]*/ - "fmla v21.4s , %[w8].4s, v10.4s\n"/* outr12 = w8 * r3[0]*/ - "fmla v22.4s , %[w8].4s, v11.4s\n"/* outr13 = w8 * 
r3[1]*/ - - "fadd v15.4s, v15.4s, %[vbias].4s\n"/* add bias */ - "fadd v16.4s, v16.4s, %[vbias].4s\n"/* add bias */ - "fadd v17.4s, v17.4s, %[vbias].4s\n"/* add bias */ - "fadd v18.4s, v18.4s, %[vbias].4s\n"/* add bias */ - "fadd v19.4s, v19.4s, %[vbias].4s\n"/* add bias */ - "fadd v20.4s, v20.4s, %[vbias].4s\n"/* add bias */ - "fadd v21.4s, v21.4s, %[vbias].4s\n"/* add bias */ - "fadd v22.4s, v22.4s, %[vbias].4s\n"/* add bias */ - - /* transpose */ - "trn1 v0.4s, v15.4s, v16.4s\n" /* r0: a0a1c0c1*/ - "trn2 v1.4s, v15.4s, v16.4s\n" /* r0: b0b1d0d1*/ - "trn1 v2.4s, v17.4s, v18.4s\n" /* r0: a2a3c2c3*/ - "trn2 v3.4s, v17.4s, v18.4s\n" /* r0: b2b3d2d3*/ - "trn1 v4.4s, v19.4s, v20.4s\n" /* r1: a0a1c0c1*/ - "trn2 v5.4s, v19.4s, v20.4s\n" /* r1: b0b1d0d1*/ - "trn1 v6.4s, v21.4s, v22.4s\n" /* r1: a2a3c2c3*/ - "trn2 v7.4s, v21.4s, v22.4s\n" /* r1: b2b3d2d3*/ - "trn1 v15.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ - "trn2 v19.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ - "trn1 v17.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ - "trn2 v21.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/ - "trn1 v16.2d, v4.2d, v6.2d\n" /* r1: a0a1a2a3*/ - "trn2 v20.2d, v4.2d, v6.2d\n" /* r1: c0c1c2c3*/ - "trn1 v18.2d, v5.2d, v7.2d\n" /* r1: b0b1b2b3*/ - "trn2 v22.2d, v5.2d, v7.2d\n" /* r1: d0d1d2d3*/ - - "cbz %w[flag_relu], 0f\n" /* skip relu*/ - "movi v0.4s, #0\n" /* for relu */ - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - "0:\n" - "cbnz %w[flag_mask], 1f\n" - "str q15, [x0]\n" /* save outc00 */ - "str q16, [x4]\n" /* save outc01 */ - "str q17, [x1]\n" /* save outc10 */ - "str q18, [x5]\n" /* save outc11 */ - "str q19, [x2]\n" /* save outc20 */ - "str q20, [x6]\n" /* save outc21 */ - "str q21, [x3]\n" /* save outc30 */ - "str q22, [x7]\n" /* save outc31 */ - "b 2f\n" - "1:\n" - "str q15, [%[out]], #16 \n" /* save remain to pre_out */ - "str q17, [%[out]], #16 \n" /* save remain to pre_out */ - "str q19, [%[out]], #16 \n" /* save remain to pre_out */ - "str q21, [%[out]], #16 \n" /* save remain to pre_out */ - "str q16, [%[out]], #16 \n" /* save remain to pre_out */ - "str q18, [%[out]], #16 \n" /* save remain to pre_out */ - "str q20, [%[out]], #16 \n" /* save remain to pre_out */ - "str q22, [%[out]], #16 \n" /* save remain to pre_out */ - "2:\n" - :[inr0] "+r"(inr0), [inr1] "+r"(inr1), - [inr2] "+r"(inr2), [inr3] "+r"(inr3), - [out]"+r"(out0) - :[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2), - [w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5), - [w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8), - [vbias]"w" (vbias), [outl] "r" (outl_ptr), - [flag_mask] "r" (flag_mask), [flag_relu] "r" (flag_relu) - : "cc", "memory", - "v0","v1","v2","v3","v4","v5","v6","v7", - "v8", "v9", "v10", "v11", "v15", - "v16","v17","v18","v19","v20","v21","v22", - "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7" - ); + act_switch_3x3s1(inr0, + inr1, + inr2, + inr3, + out0, + weight_c, + flag_mask, + outl_ptr, + w0, + w1, + w2, + w3, + w4, + w5, + w6, + w7, + w8, + vbias, + act_param); #else - asm volatile( - /* load weights */ - "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, w1, to q5, q6\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w2, to q7\n" - /* load r0, r1 */ - "vld1.32 {d0-d3}, [%[r0]]! @ load r0, q0, q1\n" - "vld1.32 {d4-d7}, [%[r0]]! 
@ load r0, q2, q3\n" - /* main loop */ - "0: @ main loop\n" - /* mul r0 with w0, w1, w2, get out r0 */ - "vmul.f32 q8, q5, q0 @ w0 * inr00\n" - "vmul.f32 q9, q5, q1 @ w0 * inr01\n" - "vmul.f32 q10, q5, q2 @ w0 * inr02\n" - "vmul.f32 q11, q5, q3 @ w0 * inr03\n" - "vmla.f32 q8, q6, q1 @ w1 * inr01\n" - "vld1.32 {d0-d3}, [%[r0]] @ load r0, q0, q1\n" - "vmla.f32 q9, q6, q2 @ w1 * inr02\n" - "vmla.f32 q10, q6, q3 @ w1 * inr03\n" - "vmla.f32 q11, q6, q0 @ w1 * inr04\n" - "vmla.f32 q8, q7, q2 @ w2 * inr02\n" - "vmla.f32 q9, q7, q3 @ w2 * inr03\n" - "vld1.32 {d4-d7}, [%[r1]]! @ load r0, q2, q3\n" - "vmla.f32 q10, q7, q0 @ w2 * inr04\n" - "vmla.f32 q11, q7, q1 @ w2 * inr05\n" - "vld1.32 {d0-d3}, [%[r1]]! @ load r0, q0, q1\n" - "vld1.32 {d8-d9}, [%[wc0]]! @ load w3 to q4\n" - /* mul r1 with w0-w5, get out r0, r1 */ - "vmul.f32 q12, q5, q2 @ w0 * inr10\n" - "vmul.f32 q13, q5, q3 @ w0 * inr11\n" - "vmul.f32 q14, q5, q0 @ w0 * inr12\n" - "vmul.f32 q15, q5, q1 @ w0 * inr13\n" - "vld1.32 {d10-d11}, [%[wc0]]! @ load w4 to q5\n" - "vmla.f32 q8, q4, q2 @ w3 * inr10\n" - "vmla.f32 q9, q4, q3 @ w3 * inr11\n" - "vmla.f32 q10, q4, q0 @ w3 * inr12\n" - "vmla.f32 q11, q4, q1 @ w3 * inr13\n" - /* mul r1 with w1, w4, get out r1, r0 */ - "vmla.f32 q8, q5, q3 @ w4 * inr11\n" - "vmla.f32 q12, q6, q3 @ w1 * inr11\n" - "vld1.32 {d4-d7}, [%[r1]] @ load r1, q2, q3\n" - "vmla.f32 q9, q5, q0 @ w4 * inr12\n" - "vmla.f32 q13, q6, q0 @ w1 * inr12\n" - "vmla.f32 q10, q5, q1 @ w4 * inr13\n" - "vmla.f32 q14, q6, q1 @ w1 * inr13\n" - "vmla.f32 q11, q5, q2 @ w4 * inr14\n" - "vmla.f32 q15, q6, q2 @ w1 * inr14\n" - "vld1.32 {d12-d13}, [%[wc0]]! @ load w5 to q6\n" - /* mul r1 with w2, w5, get out r1, r0 */ - "vmla.f32 q12, q7, q0 @ w2 * inr12\n" - "vmla.f32 q13, q7, q1 @ w2 * inr13\n" - "vmla.f32 q8, q6, q0 @ w5 * inr12\n" - "vmla.f32 q9, q6, q1 @ w5 * inr13\n" - "vld1.32 {d0-d3}, [%[r2]]! @ load r2, q0, q1\n" - "vmla.f32 q14, q7, q2 @ w2 * inr14\n" - "vmla.f32 q15, q7, q3 @ w2 * inr15\n" - "vmla.f32 q10, q6, q2 @ w5 * inr14\n" - "vmla.f32 q11, q6, q3 @ w5 * inr15\n" - "vld1.32 {d4-d7}, [%[r2]]! @ load r2, q0, q1\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w6, to q7\n" - /* mul r2 with w3-w8, get out r0, r1 */ - "vmla.f32 q12, q4, q0 @ w3 * inr20\n" - "vmla.f32 q13, q4, q1 @ w3 * inr21\n" - "vmla.f32 q14, q4, q2 @ w3 * inr22\n" - "vmla.f32 q15, q4, q3 @ w3 * inr23\n" - "vld1.32 {d8-d9}, [%[wc0]]! @ load w7, to q4\n" - "vmla.f32 q8, q7, q0 @ w6 * inr20\n" - "vmla.f32 q9, q7, q1 @ w6 * inr21\n" - "vmla.f32 q10, q7, q2 @ w6 * inr22\n" - "vmla.f32 q11, q7, q3 @ w6 * inr23\n" - /* mul r2 with w4, w7, get out r1, r0 */ - "vmla.f32 q8, q4, q1 @ w7 * inr21\n" - "vmla.f32 q12, q5, q1 @ w4 * inr21\n" - "vld1.32 {d0-d3}, [%[r2]] @ load r2, q0, q1\n" - "vmla.f32 q9, q4, q2 @ w7 * inr22\n" - "vmla.f32 q13, q5, q2 @ w4 * inr22\n" - "vmla.f32 q10, q4, q3 @ w7 * inr23\n" - "vmla.f32 q14, q5, q3 @ w4 * inr23\n" - "vmla.f32 q11, q4, q0 @ w7 * inr24\n" - "vmla.f32 q15, q5, q0 @ w4 * inr24\n" - "vld1.32 {d10-d11}, [%[wc0]]! @ load w8 to q5\n" - /* mul r1 with w5, w8, get out r1, r0 */ - "vmla.f32 q12, q6, q2 @ w5 * inr22\n" - "vmla.f32 q13, q6, q3 @ w5 * inr23\n" - "vmla.f32 q8, q5, q2 @ w8 * inr22\n" - "vmla.f32 q9, q5, q3 @ w8 * inr23\n" - "vld1.32 {d4-d7}, [%[r3]]! @ load r3, q2, q3\n" - "ldr r4, [%[outl], #32] @ load bias addr to r4\n" - "vmla.f32 q14, q6, q0 @ w5 * inr24\n" - "vmla.f32 q15, q6, q1 @ w5 * inr25\n" - "vmla.f32 q10, q5, q0 @ w8 * inr24\n" - "vmla.f32 q11, q5, q1 @ w8 * inr25\n" - "vld1.32 {d0-d3}, [%[r3]]! 
@ load r3, q0, q1\n" - "sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" - /* mul r3 with w6, w7, w8, get out r1 */ - "vmla.f32 q12, q7, q2 @ w6 * inr30\n" - "vmla.f32 q13, q7, q3 @ w6 * inr31\n" - "vmla.f32 q14, q7, q0 @ w6 * inr32\n" - "vmla.f32 q15, q7, q1 @ w6 * inr33\n" - "vmla.f32 q12, q4, q3 @ w7 * inr31\n" - "vld1.32 {d4-d7}, [%[r3]] @ load r3, q2, q3\n" - "vld1.32 {d12-d13}, [r4] @ load bias\n" - "vmla.f32 q13, q4, q0 @ w7 * inr32\n" - "vmla.f32 q14, q4, q1 @ w7 * inr33\n" - "vmla.f32 q15, q4, q2 @ w7 * inr34\n" - "ldr r0, [%[outl]] @ load outc00 to r0\n" - "vmla.f32 q12, q5, q0 @ w8 * inr32\n" - "vmla.f32 q13, q5, q1 @ w8 * inr33\n" - "ldr r5, [%[outl], #36] @ load flag_relu to r5\n" - "vmla.f32 q14, q5, q2 @ w8 * inr34\n" - "vmla.f32 q15, q5, q3 @ w8 * inr35\n" - "ldr r1, [%[outl], #4] @ load outc10 to r1\n" - "vadd.f32 q8, q8, q6 @ r00 add bias\n" - "vadd.f32 q9, q9, q6 @ r01 add bias\n" - "vadd.f32 q10, q10, q6 @ r02 add bias\n" - "vadd.f32 q11, q11, q6 @ r03 add bias\n" - "ldr r2, [%[outl], #8] @ load outc20 to r2\n" - "vadd.f32 q12, q12, q6 @ r10 add bias\n" - "vadd.f32 q13, q13, q6 @ r11 add bias\n" - "vadd.f32 q14, q14, q6 @ r12 add bias\n" - "vadd.f32 q15, q15, q6 @ r13 add bias\n" - "ldr r3, [%[outl], #12] @ load outc30 to r3\n" - "vmov.u32 q7, #0 @ mov zero to q7\n" - "cmp r5, #0 @ cmp flag relu\n" - "beq 1f @ skip relu\n" - "vmax.f32 q8, q8, q7 @ r00 relu\n" - "vmax.f32 q9, q9, q7 @ r01 relu\n" - "vmax.f32 q10, q10, q7 @ r02 relu\n" - "vmax.f32 q11, q11, q7 @ r03 relu\n" - "vmax.f32 q12, q12, q7 @ r10 relu\n" - "vmax.f32 q13, q13, q7 @ r11 relu\n" - "vmax.f32 q14, q14, q7 @ r12 relu\n" - "vmax.f32 q15, q15, q7 @ r13 relu\n" - "1:\n" - "ldr r4, [%[outl], #16] @ load outc01 to r4\n" - "vtrn.32 q8, q9 @ r0: q8 : a0a1c0c1, q9 : b0b1d0d1\n" - "vtrn.32 q10, q11 @ r0: q10: a2a3c2c3, q11: b2b3d2d3\n" - "vtrn.32 q12, q13 @ r1: q12: a0a1c0c1, q13: b0b1d0d1\n" - "vtrn.32 q14, q15 @ r1: q14: a2a3c2c3, q15: b2b3d2d3\n" - "ldr r5, [%[outl], #20] @ load outc11 to r5\n" - "vswp d17, d20 @ r0: q8 : a0a1a2a3, q10: c0c1c2c3 \n" - "vswp d19, d22 @ r0: q9 : b0b1b2b3, q11: d0d1d2d3 \n" - "vswp d25, d28 @ r1: q12: a0a1a2a3, q14: c0c1c2c3 \n" - "vswp d27, d30 @ r1: q13: b0b1b2b3, q15: d0d1d2d3 \n" - "cmp %[flag_mask], #0 @ cmp flag mask\n" - "bne 2f\n" - "vst1.32 {d16-d17}, [r0] @ save outc00\n" - "vst1.32 {d18-d19}, [r1] @ save outc10\n" - "vst1.32 {d20-d21}, [r2] @ save outc20\n" - "vst1.32 {d22-d23}, [r3] @ save outc30\n" - "vst1.32 {d24-d25}, [r4] @ save outc01\n" - "vst1.32 {d26-d27}, [r5] @ save outc11\n" - "ldr r0, [%[outl], #24] @ load outc21 to r0\n" - "ldr r1, [%[outl], #28] @ load outc31 to r1\n" - "vst1.32 {d28-d29}, [r0] @ save outc21\n" - "vst1.32 {d30-d31}, [r1] @ save outc31\n" - "b 3f @ branch end\n" - "2: \n" - "vst1.32 {d16-d17}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d18-d19}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d20-d21}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d22-d23}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d24-d25}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d26-d27}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d28-d29}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d30-d31}, [%[out0]]! 
@ save remain to pre_out\n" - "3: \n" - : [r0] "+r"(inr0), [r1] "+r"(inr1), - [r2] "+r"(inr2), [r3] "+r"(inr3), - [out0] "+r"(out0), [wc0] "+r"(weight_c) - : [flag_mask] "r" (flag_mask), [outl] "r" (outl_ptr) - : "cc", "memory", - "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", - "q10", "q11", "q12", "q13","q14", "q15", "r0", "r1", "r2", "r3", "r4", "r5" - ); -#endif // __arch64__ - // clang-format on + act_switch_3x3s1(inr0, + inr1, + inr2, + inr3, + out0, + weight_c, + flag_mask, + outl_ptr, + vbias, + vbias, + vbias, + vbias, + vbias, + vbias, + vbias, + vbias, + vbias, + vbias, + act_param); +#endif outl[0] += 4; outl[1] += 4; outl[2] += 4; @@ -519,6 +1018,10 @@ void conv_3x3s1_depthwise_fp32(const float* i_data, outl[5] += 4; outl[6] += 4; outl[7] += 4; + inr0 += 16; + inr1 += 16; + inr2 += 16; + inr3 += 16; if (flag_mask) { memcpy(outl[0] - 4, pre_out, remain * sizeof(float)); memcpy(outl[1] - 4, pre_out + 4, remain * sizeof(float)); diff --git a/lite/backends/arm/math/conv3x3s2_direct_fp32.cc b/lite/backends/arm/math/conv3x3s2_direct_fp32.cc index 807135f57dfadf690277ab57bd5597e9470ae549..f5b196efcca3f3f35367f2fea5e8f475b7147f48 100644 --- a/lite/backends/arm/math/conv3x3s2_direct_fp32.cc +++ b/lite/backends/arm/math/conv3x3s2_direct_fp32.cc @@ -75,6 +75,7 @@ void conv_3x3s2_direct_fp32(const float* i_data, //! prepack input to tmp buffer //! write output to tmp buffer auto paddings = *param.paddings; + auto act_param = param.activation_param; const int threads = ctx->threads(); int l2_size = ctx->llc_size() / sizeof(float); const int pad_w = paddings[2]; @@ -510,7 +511,8 @@ void conv_3x3s2_direct_fp32(const float* i_data, oh, ow, flag_relu, - ptr_write); + ptr_write, + &act_param); } #pragma omp parallel for num_threads(threads) @@ -839,7 +841,8 @@ void conv_3x3s2_direct_fp32(const float* i_data, oh, ow, flag_relu, - ptr_write); + ptr_write, + &act_param); } } } diff --git a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc index 455781e37e0747950e6740f6db45c1ce8c0e96c8..602239a1fe1675c6eecb5b45a8e526ada98a56bb 100644 --- a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc +++ b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc @@ -205,14 +205,12 @@ void conv_depthwise_3x3s2_fp32(const float* din, \ "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" \ "fadd v16.4s, v16.4s, v11.4s \n" \ - "fadd v16.4s, v16.4s, v12.4s \n" + "fadd v16.4s, v16.4s, v12.4s \n" /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[0] \n" #define LEFT_RESULT_S2 \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ - \ "st1 {v16.4s}, [%[outptr0]], #16 \n" \ \ "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ @@ -244,53 +242,52 @@ void conv_depthwise_3x3s2_fp32(const float* din, \ "blt 1f \n" -#define MID_COMPUTE_S2 \ - "2: \n" /* r0 */ \ - "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ - "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ - "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ - \ - "ext v10.16b, v2.16b, v18.16b, #4 \n" \ - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" /* r1 */ \ - "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ - "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ - "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ - \ - "ext v10.16b, v4.16b, v19.16b, #4 \n" \ - \ - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" /* r2 */ \ - "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ - "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ - \ - "fmul v14.4s, v5.4s, %[w0].s[1] 
\n" \ - "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ - \ - "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ - "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ - \ - "ext v10.16b, v6.16b, v20.16b, #4 \n" \ - \ - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" /* r3 */ \ - "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ - "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ - \ - "ext v10.16b, v8.16b, v21.16b, #4 \n" \ - \ - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ - \ - "fadd v16.4s, v16.4s, v11.4s \n" \ - "fadd v16.4s, v16.4s, v12.4s \n" +#define MID_COMPUTE_S2 \ + "2: \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ + "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ + \ + "ext v10.16b, v2.16b, v18.16b, #4 \n" \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v4.16b, v19.16b, #4 \n" \ + \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ext v10.16b, v6.16b, v20.16b, #4 \n" \ + \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v8.16b, v21.16b, #4 \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "ld1 {v18.4s}, [%[inptr1]] \n" #define MID_RESULT_S2 \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ - \ - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ - "ld1 {v15.4s}, [%[inptr0]] \n" \ - "ld1 {v18.4s}, [%[inptr1]] \n" \ "st1 {v16.4s}, [%[outptr0]], #16 \n" \ \ "fadd v17.4s, v17.4s, v13.4s \n" \ @@ -360,14 +357,12 @@ void conv_depthwise_3x3s2_fp32(const float* din, \ "fadd v16.4s, v16.4s, v11.4s \n" \ "fadd v16.4s, v16.4s, v12.4s \n" \ - "ld1 {v1.4s}, [%[outptr1]] \n" + "ld1 {v1.4s}, [%[outptr1]] \n" /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" #define RIGHT_RESULT_S2 \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ - \ "bif v16.16b, v0.16b, %[wmask].16b \n" \ \ "fadd v17.4s, v17.4s, v13.4s \n" \ @@ -382,11 +377,6 @@ void conv_depthwise_3x3s2_fp32(const float* din, "4: \n" #define LEFT_RESULT_S2_RELU \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ - \ "fmax v16.4s, v16.4s, %[vzero].4s \n" \ \ "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ @@ -424,14 +414,6 @@ void conv_depthwise_3x3s2_fp32(const float* din, "blt 1f \n" #define MID_RESULT_S2_RELU \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ - \ - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ - "ld1 {v15.4s}, [%[inptr0]] \n" \ - "ld1 {v18.4s}, [%[inptr1]] \n" \ 
"fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ \ "fadd v17.4s, v17.4s, v13.4s \n" \ @@ -457,11 +439,6 @@ void conv_depthwise_3x3s2_fp32(const float* din, "bne 2b \n" #define RIGHT_RESULT_S2_RELU \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ - \ "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ \ "fadd v17.4s, v17.4s, v13.4s \n" \ diff --git a/lite/backends/arm/math/conv_block_utils.h b/lite/backends/arm/math/conv_block_utils.h index e4279d9a728bc7af0f14a00b781db449fc426582..c4fe965d0b17fa56d76812af14b40bddbc5b313a 100644 --- a/lite/backends/arm/math/conv_block_utils.h +++ b/lite/backends/arm/math/conv_block_utils.h @@ -20,6 +20,7 @@ #include "lite/backends/arm/math/sgemm.h" #include "lite/backends/arm/math/type_trans.h" #include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" #include "lite/utils/cp_logging.h" namespace paddle { @@ -28,6 +29,7 @@ namespace arm { namespace math { #define LITEMAX(a, b) ((a) > (b) ? (a) : (b)) +#define LITEMIN(a, b) ((a) < (b) ? (a) : (b)) #define ROUNDUP(a, b) ((((a) + (b)-1) / (b)) * (b)) template @@ -589,7 +591,238 @@ inline void prepack_input_nxwc8_int8_dw(const int8_t* din, } } } +// clang-format off +#ifdef __aarch64__ +#define NCHWC1_TRANS_FP32_COMPUTE \ + "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c1r0, c0r1*/ \ + "ldr q1, [%[ptr_din]], #16 \n" /* load data, c0r0, c1r0, c0r1*/ \ + "ldr q2, [%[ptr_din]], #16 \n" /* load data, c0r0, c1r0, c0r1*/ \ + "ldr q3, [%[ptr_din]], #16 \n" /* load data, c0r0, c1r0, c0r1*/ \ + "movi v20.4s, #0 \n" /* for relu */ \ + "1: \n" /* main loop*/ + +#define NCHWC1_TRANS_FP32_RELU \ + "fmax v0.4s, v0.4s, v20.4s \n" /*relu*/ \ + "fmax v1.4s, v1.4s, v20.4s \n" /*relu*/ \ + "fmax v2.4s, v2.4s, v20.4s \n" /*relu*/ \ + "fmax v3.4s, v3.4s, v20.4s \n" /*relu*/ + +#define NCHWC1_TRANS_FP32_RELU6 \ + "fmin v0.4s, v0.4s, %[six].4s \n" /* relu6 */ \ + "fmin v1.4s, v1.4s, %[six].4s \n" /* relu6 */ \ + "fmin v2.4s, v2.4s, %[six].4s \n" /* relu6 */ \ + "fmin v3.4s, v3.4s, %[six].4s \n" /* relu6 */ + +#define NCHWC1_TRANS_FP32_LEAKY_RELU \ + "cmhs v4.4s, v0.4s, v20.4s \n" /* vcgeq_u32 */ \ + "cmhs v5.4s, v1.4s, v20.4s \n" /* vcgeq_u32 */ \ + "cmhs v6.4s, v2.4s, v20.4s \n" /* vcgeq_u32 */ \ + "cmhs v7.4s, v3.4s, v20.4s \n" /* vcgeq_u32 */ \ + "fmul v8.4s, v0.4s, %[scale].4s \n" /* mul */ \ + "fmul v9.4s, v1.4s, %[scale].4s \n" /* mul */ \ + "fmul v10.4s, v2.4s, %[scale].4s \n" /* mul */ \ + "fmul v11.4s, v3.4s, %[scale].4s \n" /* mul */ \ + "bif v0.16b, v8.16b, v4.16b \n" /* choose*/ \ + "bif v1.16b, v9.16b, v5.16b \n" /* choose*/ \ + "bif v2.16b, v10.16b, v6.16b \n" /* choose*/ \ + "bif v3.16b, v11.16b, v7.16b \n" /* choose*/ + +#define NCHWC1_TRANS_FP32_STORE \ + "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ \ + \ + "str q0, [%[doutc0r0]], #16 \n" /* store c0r0*/ \ + "str q1, [%[doutc0r0]], #16 \n" /* store c0r0*/ \ + "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c1r0, c0r1*/ \ + "ldr q1, [%[ptr_din]], #16 \n" /* load data, c0r0, c1r0, c0r1*/ \ + "str q2, [%[doutc0r0]], #16 \n" /* store c0r0*/ \ + "str q3, [%[doutc0r0]], #16 \n" /* store c2r0*/ \ + "ldr q2, [%[ptr_din]], #16 \n" /* load data, c0r0, c1r0, c0r1*/ \ + "ldr q3, [%[ptr_din]], #16 \n" /* load data, c0r0, c1r0, c0r1*/ \ + \ + "bne 1b \n" /* jump to main loop*/ +#else +#define NCHWC1_TRANS_FP32_COMPUTE \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0 \n" \ + "vld1.32 {d4-d7}, [%[ptr_din]]! 
@ load data, c0r0 \n" \ + "vmov.u32 q15, #0 @ dump zero\n" \ + "1: @ main loop\n" +#define NCHWC1_TRANS_FP32_RELU \ + "vmax.f32 q0, q0, q15 @ relu\n" \ + "vmax.f32 q1, q1, q15 @ relu\n" \ + "vmax.f32 q2, q2, q15 @ relu\n" \ + "vmax.f32 q3, q3, q15 @ relu\n" + +#define NCHWC1_TRANS_FP32_RELU6 \ + "vmin.f32 q0, q0, %q[six] @ relu6 \n" \ + "vmin.f32 q1, q1, %q[six] @ relu6 \n" \ + "vmin.f32 q2, q2, %q[six] @ relu6 \n" \ + "vmin.f32 q3, q3, %q[six] @ relu6 \n" + +#define NCHWC1_TRANS_FP32_LEAKY_RELU \ + "vcge.f32 q5, q0, q15 @ q0 > 0 \n" \ + "vcge.f32 q6, q1, q15 @ q0 > 0 \n" \ + "vcge.f32 q7, q2, q15 @ q0 > 0 \n" \ + "vcge.f32 q8, q3, q15 @ q0 > 0 \n" \ + "vmul.f32 q9, q0, %q[scale] \n" \ + "vmul.f32 q10, q1, %q[scale] \n" \ + "vmul.f32 q11, q2, %q[scale] \n" \ + "vmul.f32 q12, q3, %q[scale] \n" \ + "vbif q0, q9, q5 @ choose \n" \ + "vbif q1, q10, q6 @ choose \n" \ + "vbif q2, q11, q7 @ choose \n" \ + "vbif q3, q12, q8 @ choose \n" + +#define NCHWC1_TRANS_FP32_STORE \ + "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result \n" \ + "vst1.32 {d2-d3}, [%[doutc0r0]]! @ store result, \n" \ + "subs %[cnt], %[cnt], #1 @ loop count - 1\n" \ + \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data \n" \ + "vst1.32 {d4-d5}, [%[doutc0r0]]! @ store result \n" \ + "vst1.32 {d6-d7}, [%[doutc0r0]]! @ store result, \n" \ + \ + "vld1.32 {d4-d7}, [%[ptr_din]]! @ load data \n" \ + \ + "bne 1b @ jump to main loop\n" +#endif +// clang-format on +inline void act_switch_c1_fp32(const float* din_ptr, + float* doutc0_ptr, + int cnt_loop, + const operators::ActivationParam* act_param) { + if (act_param != nullptr && act_param->has_active) { + float32x4_t six = vdupq_n_f32(act_param->Relu_clipped_coef); + float32x4_t scale = vdupq_n_f32(act_param->Leaky_relu_alpha); + switch (act_param->active_type) { + case lite_api::ActivationType::kRelu: +#ifdef __aarch64__ + asm volatile(NCHWC1_TRANS_FP32_COMPUTE NCHWC1_TRANS_FP32_RELU + NCHWC1_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_ptr) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v20"); +#else + asm volatile(NCHWC1_TRANS_FP32_COMPUTE NCHWC1_TRANS_FP32_RELU + NCHWC1_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [ptr_din] "+r"(din_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q15"); +#endif + break; + case lite_api::ActivationType::kRelu6: +/* 0 <= din <= 6 */ +#ifdef __aarch64__ + asm volatile(NCHWC1_TRANS_FP32_COMPUTE NCHWC1_TRANS_FP32_RELU + NCHWC1_TRANS_FP32_RELU6 NCHWC1_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_ptr) + : [six] "w"(six) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v20"); +#else + asm volatile(NCHWC1_TRANS_FP32_COMPUTE NCHWC1_TRANS_FP32_RELU + NCHWC1_TRANS_FP32_RELU6 NCHWC1_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [ptr_din] "+r"(din_ptr), + [cnt] "+r"(cnt_loop) + : [six] "w"(six) + : "q0", "q1", "q2", "q3", "q15"); +#endif + break; + case lite_api::ActivationType::kLeakyRelu: +/*din = din >= 0 ? 
din : din * scale*/ +#ifdef __aarch64__ + asm volatile(NCHWC1_TRANS_FP32_COMPUTE NCHWC1_TRANS_FP32_LEAKY_RELU + NCHWC1_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_ptr) + : [scale] "w"(scale) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v20"); +#else + asm volatile(NCHWC1_TRANS_FP32_COMPUTE NCHWC1_TRANS_FP32_LEAKY_RELU + NCHWC1_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [ptr_din] "+r"(din_ptr), + [cnt] "+r"(cnt_loop) + : [scale] "w"(scale) + : "q0", + "q1", + "q2", + "q3", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q15"); +#endif + break; + default: + LOG(FATAL) << "this act_type: " + << static_cast(act_param->active_type) + << " fuse not support"; + } + } else { +#ifdef __aarch64__ + asm volatile(NCHWC1_TRANS_FP32_COMPUTE NCHWC1_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_ptr) + : + : "v0", "v1", "v2", "v3", "v20"); +#else + asm volatile(NCHWC1_TRANS_FP32_COMPUTE NCHWC1_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [ptr_din] "+r"(din_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q15"); +#endif + } +} /*wirte result in outputs * input din: [n, c, h, w], output dout: [n, c, h, w] */ @@ -605,13 +838,14 @@ inline bool write_to_output_c1_fp32(const float* din, int height, int width, bool flag_relu, - float* trash_ptr) { + float* trash_ptr, + operators::ActivationParam* act_param) { if (cs > channel) { return true; } const int c1 = 1; - const int w4 = 4; + const int w4 = 16; int size_c_out = width * height; @@ -623,98 +857,53 @@ inline bool write_to_output_c1_fp32(const float* din, int w_round = we - ws; int cnt = (width - ws) / w4; - + int remain = (width - ws) % w4; for (int i = 0; i < size_h; i++) { int size_w = i * width; float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; const float* din_hei_ptr = ptr_din + i * w_round * c1; if (cnt > 0) { int cnt_loop = cnt; - if (flag_relu) { -#ifdef __aarch64__ - asm volatile( - "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1, c0r2, - c0r3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "fmax v1.4s, v0.4s, v20.4s \n" /*relu*/ - "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1, c0r2, - c0r3 */ - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q1, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "bne 1b \n" /* jump to main loop*/ - : [doutc0r0] "+r"(doutc0_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v20"); -#else - asm volatile( - "vld1.32 {d0-d1}, [%[ptr_din]]! @ load data, c0r0, " - "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - - "vmax.f32 q1, q0, q15 @ relu\n" - "vld1.32 {d0-d1}, [%[ptr_din]]! @ load data \n" - - "vst1.32 {d2-d3}, [%[doutc0r0]]! 
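Every asm variant selected by these `act_switch_*` helpers, and every scalar remainder loop below them, must agree on the same activation semantics. As a reference (sketch; `apply_act` is an illustrative name, the enum and parameter fields are the ones used in this file):

```cpp
#include <algorithm>

// Reference semantics of the three fused activations dispatched above.
inline float apply_act(float x, lite_api::ActivationType t, float six,
                       float alpha) {
  switch (t) {
    case lite_api::ActivationType::kRelu:
      return std::max(x, 0.f);
    case lite_api::ActivationType::kRelu6:
      return std::min(std::max(x, 0.f), six);  // clamp to [0, six]
    case lite_api::ActivationType::kLeakyRelu:
      return x >= 0.f ? x : x * alpha;
    default:
      return x;  // no fused activation
  }
}
```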
@ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q15"); -#endif - } else { -#ifdef __aarch64__ - asm volatile( - "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1, c0r2, - c0r3 */ - "1: \n" /* main loop*/ - "str q0, [%[doutc0r0]], #16 \n" /* store c2r0*/ - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "ldr q0, [%[ptr_din]], #16 \n" /* load data, c0r0, c0r1, c0r2, - c0r3 */ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0"); -#else - asm volatile( - "vld1.32 {d0-d1}, [%[ptr_din]]! @ load data, c0r0, " - "c0r1, c0r2, c0r3\n" - "1: @ main loop\n" - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - "vld1.32 {d0-d1}, [%[ptr_din]]! @ load data \n" - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0"); -#endif - } + act_switch_c1_fp32(din_hei_ptr, doutc0_ptr, cnt_loop, act_param); } - if (we > width) { + if (remain > 0) { int offset = i * w_round * c1 + c1 * w4 * cnt; din_hei_ptr = ptr_din + offset; - int j = we - w4; - if (flag_relu) { - for (; j < width; ++j) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); - din_hei_ptr++; + doutc0_ptr += w4 * cnt; + int j = w4 * cnt; + if (act_param != nullptr && act_param->has_active) { + float six = act_param->Relu_clipped_coef; + float scale = act_param->Leaky_relu_alpha; + switch (act_param->active_type) { + case lite_api::ActivationType::kRelu: + for (; j < width; ++j) { + *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); + din_hei_ptr++; + } + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + for (; j < width; ++j) { + float tmp = LITEMAX(din_hei_ptr[0], 0.f); + *(doutc0_ptr++) = LITEMIN(tmp, six); + din_hei_ptr++; + } + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? 
din : din * scale*/
+          for (; j < width; ++j) {
+            if (din_hei_ptr[0] >= 0) {
+              *(doutc0_ptr++) = din_hei_ptr[0];
+            } else {
+              *(doutc0_ptr++) = din_hei_ptr[0] * scale;
+            }
+            din_hei_ptr++;
+          }
+          break;
+        default:
+          LOG(FATAL) << "this act_type: "
+                     << static_cast<int>(act_param->active_type)
+                     << " fuse not support";
      }
    } else {
      for (; j < width; ++j) {
@@ -725,6 +914,7 @@ inline bool write_to_output_c1_fp32(const float* din,
   }
   return true;
 }
+// clang-format off
 #ifdef __aarch64__
 #define NCHWC2_TRANS_FP32_COMPUTE \
   "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1*/ \
@@ -740,6 +930,18 @@ inline bool write_to_output_c1_fp32(const float* din,
   "fmax v2.4s, v4.4s, v20.4s \n" /*relu*/ \
   "fmax v3.4s, v5.4s, v20.4s \n" /*relu*/

+#define NCHWC2_TRANS_FP32_RELU6 \
+  "fmin v2.4s, v2.4s, %[six].4s \n" /* relu6 */ \
+  "fmin v3.4s, v3.4s, %[six].4s \n" /* relu6 */
+
+#define NCHWC2_TRANS_FP32_LEAKY_RELU \
+  "fcmge v6.4s, v2.4s, v20.4s \n" /* vcgeq_f32 */ \
+  "fcmge v7.4s, v3.4s, v20.4s \n" /* vcgeq_f32 */ \
+  "fmul v4.4s, v2.4s, %[scale].4s \n" /* mul */ \
+  "fmul v5.4s, v3.4s, %[scale].4s \n" /* mul */ \
+  "bif v2.16b, v4.16b, v6.16b \n" /* choose*/ \
+  "bif v3.16b, v5.16b, v7.16b \n" /* choose*/
+
 #define NCHWC2_TRANS_FP32_STORE \
   "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ \
   \
@@ -749,8 +951,7 @@ inline bool write_to_output_c1_fp32(const float* din,
   "bne 1b \n" /* jump to main loop*/
 #else
 #define NCHWC2_TRANS_FP32_COMPUTE \
-  "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0, " \
-  "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" \
+  "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0, c1r0 \n" \
   "vmov.u32 q15, #0 @ dump zero\n" \
   "1: @ main loop\n" \
   "vtrn.32 d0, d1 @ trans data:c0r0, c0r1, " \
@@ -764,11 +965,21 @@ inline bool write_to_output_c1_fp32(const float* din,
   "vmax.f32 q0, q0, q15 @ relu\n" \
   "vmax.f32 q1, q1, q15 @ relu\n"

+#define NCHWC2_TRANS_FP32_RELU6 \
+  "vmin.f32 q0, q0, %q[six] @ relu6 \n" \
+  "vmin.f32 q1, q1, %q[six] @ relu6 \n"
+
+#define NCHWC2_TRANS_FP32_LEAKY_RELU \
+  "vcge.f32 q5, q0, q15 @ q0 > 0 \n" \
+  "vcge.f32 q6, q1, q15 @ q0 > 0 \n" \
+  "vmul.f32 q9, q0, %q[scale] \n" \
+  "vmul.f32 q10, q1, %q[scale] \n" \
+  "vbif q0, q9, q5 @ choose \n" \
+  "vbif q1, q10, q6 @ choose \n"
+
 #define NCHWC2_TRANS_FP32_STORE \
-  "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " \
-  "pointer\n" \
-  "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add " \
-  "pointer\n" \
+  "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" \
+  "vst1.32 {d2-d3}, [%[doutc1r0]]!
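Both ISAs implement leaky relu without branches: build an all-ones lane mask where x >= 0 (`fcmge` on AArch64, `vcge.f32` on ARMv7), compute x*scale unconditionally, then bit-select with `bif`/`vbif`. The same triple with intrinsics (a sketch, not this file's code path):

```cpp
#include <arm_neon.h>

// Branchless leaky relu on one vector: keep x where x >= 0, else x*alpha.
static inline float32x4_t leaky_relu_f32x4(float32x4_t x, float32x4_t alpha) {
  uint32x4_t ge0 = vcgeq_f32(x, vdupq_n_f32(0.f));  // per-lane all-ones/zero
  float32x4_t scaled = vmulq_f32(x, alpha);
  return vbslq_f32(ge0, x, scaled);                 // bit-select, like bif/vbif
}
```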
@ store result, add pointer\n" \ \ "subs %[cnt], %[cnt], #1 @ loop count - 1\n" \ \ @@ -776,6 +987,151 @@ inline bool write_to_output_c1_fp32(const float* din, \ "bne 1b @ jump to main loop\n" #endif +// clang-format on +inline void act_switch_c2_fp32(const float* din_ptr, + float* doutc0_ptr, + float* doutc1_ptr, + int cnt_loop, + const operators::ActivationParam* act_param) { + if (act_param != nullptr && act_param->has_active) { + float32x4_t six = vdupq_n_f32(act_param->Relu_clipped_coef); + float32x4_t scale = vdupq_n_f32(act_param->Leaky_relu_alpha); + switch (act_param->active_type) { + case lite_api::ActivationType::kRelu: +#ifdef __aarch64__ + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_RELU + NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_ptr) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v20"); +#else + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_RELU + NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [ptr_din] "+r"(din_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q15"); +#endif + break; + case lite_api::ActivationType::kRelu6: +/* 0 <= din <= 6 */ +#ifdef __aarch64__ + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_RELU + NCHWC2_TRANS_FP32_RELU6 NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_ptr) + : [six] "w"(six) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v20"); +#else + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_RELU + NCHWC2_TRANS_FP32_RELU6 NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [ptr_din] "+r"(din_ptr), + [cnt] "+r"(cnt_loop) + : [six] "w"(six) + : "q0", "q1", "q2", "q3", "q15"); +#endif + break; + case lite_api::ActivationType::kLeakyRelu: +/*din = din >= 0 ? 
din : din * scale*/ +#ifdef __aarch64__ + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_LEAKY_RELU + NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_ptr) + : [scale] "w"(scale) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v20"); +#else + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_LEAKY_RELU + NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [ptr_din] "+r"(din_ptr), + [cnt] "+r"(cnt_loop) + : [scale] "w"(scale) + : "q0", + "q1", + "q2", + "q3", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q15"); +#endif + break; + default: + LOG(FATAL) << "this act_type: " + << static_cast(act_param->active_type) + << " fuse not support"; + } + } else { +#ifdef __aarch64__ + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_ptr) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v20"); +#else + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [ptr_din] "+r"(din_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q15"); +#endif + } +} /*wirte result in outputs * input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w] */ @@ -791,11 +1147,11 @@ inline bool write_to_output_c2_fp32(const float* din, int height, int width, bool flag_relu, - float* trash_ptr) { + float* trash_ptr, + operators::ActivationParam* act_param) { if (cs > channel) { return true; } - const int c2 = 2; const int w4 = 4; @@ -828,55 +1184,56 @@ inline bool write_to_output_c2_fp32(const float* din, const float* din_hei_ptr = ptr_din + i * w_round * c2; if (cnt > 0) { int cnt_loop = cnt; - if (flag_relu) { -#ifdef __aarch64__ - asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_RELU - NCHWC2_TRANS_FP32_STORE - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v20"); -#else - asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_RELU - NCHWC2_TRANS_FP32_STORE - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); -#endif - } else { -#ifdef __aarch64__ - asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_STORE - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5"); -#else - asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_STORE - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); -#endif - } + act_switch_c2_fp32( + din_hei_ptr, doutc0_ptr, doutc1_ptr, cnt_loop, act_param); } if (we > width) { int offset = i * w_round * c2 + c2 * w4 * cnt; din_hei_ptr = ptr_din + offset; + doutc0_ptr += w4 * cnt; + doutc1_ptr += w4 * cnt; int j = we - w4; - if (flag_relu) { - for (; j < width; ++j) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); - *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f); - din_hei_ptr += 2; + if (act_param != nullptr && act_param->has_active) { + float six = act_param->Relu_clipped_coef; + float scale = act_param->Leaky_relu_alpha; + switch (act_param->active_type) { + 
case lite_api::ActivationType::kRelu:
+            for (; j < width; ++j) {
+              *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f);
+              *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f);
+              din_hei_ptr += 2;
+            }
+            break;
+          case lite_api::ActivationType::kRelu6:
+            /* 0 <= din <= 6 */
+            for (; j < width; ++j) {
+              float tmp1 = LITEMAX(din_hei_ptr[0], 0.f);
+              float tmp2 = LITEMAX(din_hei_ptr[1], 0.f);
+              *(doutc0_ptr++) = LITEMIN(tmp1, six);
+              *(doutc1_ptr++) = LITEMIN(tmp2, six);
+              din_hei_ptr += 2;
+            }
+            break;
+          case lite_api::ActivationType::kLeakyRelu:
+            /*din = din >= 0 ? din : din * scale*/
+            for (; j < width; ++j) {
+              if (din_hei_ptr[0] >= 0) {
+                *(doutc0_ptr++) = din_hei_ptr[0];
+              } else {
+                *(doutc0_ptr++) = din_hei_ptr[0] * scale;
+              }
+              if (din_hei_ptr[1] >= 0) {
+                *(doutc1_ptr++) = din_hei_ptr[1];
+              } else {
+                *(doutc1_ptr++) = din_hei_ptr[1] * scale;
+              }
+              din_hei_ptr += 2;
+            }
+            break;
+          default:
+            LOG(FATAL) << "this act_type: "
+                       << static_cast<int>(act_param->active_type)
+                       << " fuse not support";
        }
      } else {
        for (; j < width; ++j) {
@@ -888,7 +1245,7 @@ inline bool write_to_output_c2_fp32(const float* din,
   }
   return true;
 }
-
+// clang-format off
 #ifdef __aarch64__
 #define NCHWC4_TRANS_FP32_COMPUTE \
   "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \
@@ -912,6 +1269,26 @@ inline bool write_to_output_c2_fp32(const float* din,
   "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ \
   "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/

+#define NCHWC4_TRANS_FP32_RELU6 \
+  "fmin v16.4s, v16.4s, %[six].4s \n" /* relu6 */ \
+  "fmin v17.4s, v17.4s, %[six].4s \n" /* relu6 */ \
+  "fmin v18.4s, v18.4s, %[six].4s \n" /* relu6 */ \
+  "fmin v19.4s, v19.4s, %[six].4s \n" /* relu6 */
+
+#define NCHWC4_TRANS_FP32_LEAKY_RELU \
+  "fcmge v8.4s, v16.4s, v20.4s \n" /* vcgeq_f32 */ \
+  "fcmge v9.4s, v17.4s, v20.4s \n" /* vcgeq_f32 */ \
+  "fcmge v10.4s, v18.4s, v20.4s \n" /* vcgeq_f32 */ \
+  "fcmge v11.4s, v19.4s, v20.4s \n" /* vcgeq_f32 */ \
+  "fmul v4.4s, v16.4s, %[scale].4s \n" /* mul */ \
+  "fmul v5.4s, v17.4s, %[scale].4s \n" /* mul */ \
+  "fmul v6.4s, v18.4s, %[scale].4s \n" /* mul */ \
+  "fmul v7.4s, v19.4s, %[scale].4s \n" /* mul */ \
+  "bif v16.16b, v4.16b, v8.16b \n" /* choose*/ \
+  "bif v17.16b, v5.16b, v9.16b \n" /* choose*/ \
+  "bif v18.16b, v6.16b, v10.16b \n" /* choose*/ \
+  "bif v19.16b, v7.16b, v11.16b \n" /* choose*/
+
 #define NCHWC4_TRANS_FP32_STORE \
   "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ \
   "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ \
@@ -940,6 +1317,26 @@ inline bool write_to_output_c2_fp32(const float* din,
   "vmax.f32 q2, q2, q15 @ relu\n" \
   "vmax.f32 q3, q3, q15 @ relu\n"

+#define NCHWC4_TRANS_FP32_RELU6 \
+  "vmin.f32 q0, q0, %q[six] @ relu6 \n" \
+  "vmin.f32 q1, q1, %q[six] @ relu6 \n" \
+  "vmin.f32 q2, q2, %q[six] @ relu6 \n" \
+  "vmin.f32 q3, q3, %q[six] @ relu6 \n"
+
+#define NCHWC4_TRANS_FP32_LEAKY_RELU \
+  "vcge.f32 q5, q0, q15 @ q0 > 0 \n" \
+  "vcge.f32 q6, q1, q15 @ q0 > 0 \n" \
+  "vcge.f32 q7, q2, q15 @ q0 > 0 \n" \
+  "vcge.f32 q8, q3, q15 @ q0 > 0 \n" \
+  "vmul.f32 q9, q0, %q[scale] \n" \
+  "vmul.f32 q10, q1, %q[scale] \n" \
+  "vmul.f32 q11, q2, %q[scale] \n" \
+  "vmul.f32 q12, q3, %q[scale] \n" \
+  "vbif q0, q9, q5 @ choose \n" \
+  "vbif q1, q10, q6 @ choose \n" \
+  "vbif q2, q11, q7 @ choose \n" \
+  "vbif q3, q12, q8 @ choose \n"
+
 #define NCHWC4_TRANS_FP32_STORE \
   "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" \
   "vst1.32 {d2-d3}, [%[doutc1r0]]!
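These c4 macros exist because the kernels emit channel-blocked tiles (four channels interleaved per pixel) while the output tensor is plain NCHW; the vector shuffles above are a vectorized form of this scalar de-interleave (reference sketch, helper name illustrative):

```cpp
// Scalar reference of the c4 write-back: din interleaves 4 channels per
// pixel; dout0..dout3 are four contiguous NCHW channel rows.
inline void unblock_c4(const float* din, float* dout0, float* dout1,
                       float* dout2, float* dout3, int w) {
  for (int j = 0; j < w; ++j) {
    dout0[j] = din[4 * j + 0];
    dout1[j] = din[4 * j + 1];
    dout2[j] = din[4 * j + 2];
    dout3[j] = din[4 * j + 3];
  }
}
```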
@ store result, add pointer\n" \ @@ -953,68 +1350,19 @@ inline bool write_to_output_c2_fp32(const float* din, \ "bne 1b @ jump to main loop\n" #endif -/*wirte result in outputs -* input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w] -*/ -inline bool write_to_output_c4_fp32(const float* din, - float* dout, - int cs, - int ce, - int hs, - int he, - int ws, - int we, - int channel, - int height, - int width, - bool flag_relu, - float* trash_ptr) { - const int c4 = 4; - const int w4 = 4; - const int w_round = we - ws; - const int ch_n = ce - cs; - if (ch_n != 4) { - LOG(ERROR) << "write_to_output_c4_fp32 ch_n must be equal 4 and hei_n is " - "more than zero"; - return false; - } - int size_c_out = width * height; - - float* doutc0r0 = dout + cs * size_c_out + hs * width + ws; - float* doutc1r0 = doutc0r0 + size_c_out; - float* doutc2r0 = doutc1r0 + size_c_out; - float* doutc3r0 = doutc2r0 + size_c_out; - - const float* ptr_din = din; - - int size_h = (he > height ? height : he) - hs; // size_h == hei_n - - int valid_we = we > width ? width : we; - int cnt = (valid_we - ws) / w4; - int remain = valid_we - ws - cnt * w4; - - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - float* doutc1_ptr = doutc1r0 + size_w; - float* doutc2_ptr = doutc2r0 + size_w; - float* doutc3_ptr = doutc3r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 3: - doutc1_ptr = trash_ptr; - case 2: - doutc2_ptr = trash_ptr; - case 1: - doutc3_ptr = trash_ptr; - default: - break; - } - } - const float* din_hei_ptr = ptr_din + i * w_round * ch_n; - if (cnt > 0) { - int cnt_loop = cnt; - if (flag_relu) { +// clang-format on +inline void act_switch_c4_fp32(const float* din_ptr, + float* doutc0_ptr, + float* doutc1_ptr, + float* doutc2_ptr, + float* doutc3_ptr, + int cnt_loop, + const operators::ActivationParam* act_param) { + if (act_param != nullptr && act_param->has_active) { + float32x4_t six = vdupq_n_f32(act_param->Relu_clipped_coef); + float32x4_t scale = vdupq_n_f32(act_param->Leaky_relu_alpha); + switch (act_param->active_type) { + case lite_api::ActivationType::kRelu: #ifdef __aarch64__ asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_RELU NCHWC4_TRANS_FP32_STORE @@ -1023,7 +1371,7 @@ inline bool write_to_output_c4_fp32(const float* din, [doutc2r0] "+r"(doutc2_ptr), [doutc3r0] "+r"(doutc3_ptr), [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) + [ptr_din] "+r"(din_ptr) : : "v0", "v1", @@ -1052,57 +1400,290 @@ inline bool write_to_output_c4_fp32(const float* din, [doutc1r0] "+r"(doutc1_ptr), [doutc2r0] "+r"(doutc2_ptr), [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), + [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : : "q0", "q1", "q2", "q3", "q15"); #endif - } else { + break; + case lite_api::ActivationType::kRelu6: +/* 0 <= din <= 6 */ #ifdef __aarch64__ - asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_STORE + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_RELU + NCHWC4_TRANS_FP32_RELU6 NCHWC4_TRANS_FP32_STORE : [doutc0r0] "+r"(doutc0_ptr), [doutc1r0] "+r"(doutc1_ptr), [doutc2r0] "+r"(doutc2_ptr), [doutc3r0] "+r"(doutc3_ptr), [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : + [ptr_din] "+r"(din_ptr) + : [six] "w"(six) : "v0", "v1", "v2", "v3", + "v4", + "v5", + "v6", + "v7", "v8", "v9", "v10", "v11", + "v12", + "v13", + "v14", "v16", "v17", "v18", - "v19"); + "v19", + "v20"); #else - asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_STORE + asm 
volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_RELU + NCHWC4_TRANS_FP32_RELU6 NCHWC4_TRANS_FP32_STORE : [doutc0r0] "+r"(doutc0_ptr), [doutc1r0] "+r"(doutc1_ptr), [doutc2r0] "+r"(doutc2_ptr), [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), + [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3"); + : [six] "w"(six) + : "q0", "q1", "q2", "q3", "q15"); +#endif + break; + case lite_api::ActivationType::kLeakyRelu: +/*din = din >= 0 ? din : din * scale*/ +#ifdef __aarch64__ + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_LEAKY_RELU + NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_ptr) + : [scale] "w"(scale) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v16", + "v17", + "v18", + "v19", + "v20"); +#else + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_LEAKY_RELU + NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [ptr_din] "+r"(din_ptr), + [cnt] "+r"(cnt_loop) + : [scale] "w"(scale) + : "q0", + "q1", + "q2", + "q3", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q15"); #endif + break; + default: + LOG(FATAL) << "this act_type: " + << static_cast(act_param->active_type) + << " fuse not support"; + } + } else { +#ifdef __aarch64__ + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_ptr) + : + : "v0", + "v1", + "v2", + "v3", + "v8", + "v9", + "v10", + "v11", + "v16", + "v17", + "v18", + "v19"); +#else + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [ptr_din] "+r"(din_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q15"); +#endif + } +} +/*wirte result in outputs +* input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w] +*/ +inline bool write_to_output_c4_fp32(const float* din, + float* dout, + int cs, + int ce, + int hs, + int he, + int ws, + int we, + int channel, + int height, + int width, + bool flag_relu, + float* trash_ptr, + operators::ActivationParam* act_param) { + const int c4 = 4; + const int w4 = 4; + const int w_round = we - ws; + const int ch_n = ce - cs; + + if (ch_n != 4) { + LOG(ERROR) << "write_to_output_c4_fp32 ch_n must be equal 4 and hei_n is " + "more than zero"; + return false; + } + int size_c_out = width * height; + + float* doutc0r0 = dout + cs * size_c_out + hs * width + ws; + float* doutc1r0 = doutc0r0 + size_c_out; + float* doutc2r0 = doutc1r0 + size_c_out; + float* doutc3r0 = doutc2r0 + size_c_out; + + const float* ptr_din = din; + + int size_h = (he > height ? height : he) - hs; // size_h == hei_n + + int valid_we = we > width ? 
width : we; + int cnt = (valid_we - ws) / w4; + int remain = valid_we - ws - cnt * w4; + + for (int i = 0; i < size_h; i++) { + int size_w = i * width; + float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; + float* doutc1_ptr = doutc1r0 + size_w; + float* doutc2_ptr = doutc2r0 + size_w; + float* doutc3_ptr = doutc3r0 + size_w; + if (ce > channel) { + switch (ce - channel) { + case 3: + doutc1_ptr = trash_ptr; + case 2: + doutc2_ptr = trash_ptr; + case 1: + doutc3_ptr = trash_ptr; + default: + break; } } + const float* din_hei_ptr = ptr_din + i * w_round * ch_n; + if (cnt > 0) { + int cnt_loop = cnt; + act_switch_c4_fp32(din_hei_ptr, + doutc0_ptr, + doutc1_ptr, + doutc2_ptr, + doutc3_ptr, + cnt_loop, + act_param); + } if (remain > 0) { int offset = i * w_round * c4 + c4 * w4 * cnt; din_hei_ptr = ptr_din + offset; + doutc0_ptr += w4 * cnt; + doutc1_ptr += w4 * cnt; + doutc2_ptr += w4 * cnt; + doutc3_ptr += w4 * cnt; int j = 0; - if (flag_relu) { - for (; j < remain; ++j) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); - *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f); - *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2], 0.f); - *(doutc3_ptr++) = LITEMAX(din_hei_ptr[3], 0.f); - din_hei_ptr += w4; + if (act_param != nullptr && act_param->has_active) { + float six = act_param->Relu_clipped_coef; + float scale = act_param->Leaky_relu_alpha; + switch (act_param->active_type) { + case lite_api::ActivationType::kRelu: + for (; j < remain; ++j) { + *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); + *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f); + *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2], 0.f); + *(doutc3_ptr++) = LITEMAX(din_hei_ptr[3], 0.f); + din_hei_ptr += 4; + } + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + for (; j < remain; ++j) { + float tmp1 = LITEMAX(din_hei_ptr[0], 0.f); + float tmp2 = LITEMAX(din_hei_ptr[1], 0.f); + float tmp3 = LITEMAX(din_hei_ptr[2], 0.f); + float tmp4 = LITEMAX(din_hei_ptr[3], 0.f); + *(doutc0_ptr++) = LITEMIN(tmp1, six); + *(doutc1_ptr++) = LITEMIN(tmp2, six); + *(doutc2_ptr++) = LITEMIN(tmp3, six); + *(doutc3_ptr++) = LITEMIN(tmp4, six); + din_hei_ptr += 4; + } + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? 
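When the channel-block end `ce` runs past `channel`, the extra rows are redirected to `trash_ptr` so the full-width vector stores stay unconditional; the `switch` just below relies on deliberate fallthrough so that `ce - channel == k` retargets the last k pointers. The same intent written with an explicit loop (sketch only):

```cpp
// Equivalent of the fallthrough switch: send the last (ce - channel)
// channel rows to the scratch sink so full-width stores remain legal.
float* outs[4] = {doutc0_ptr, doutc1_ptr, doutc2_ptr, doutc3_ptr};
for (int k = 0; k < ce - channel; ++k) {
  outs[3 - k] = trash_ptr;  // retarget from the last row backwards
}
```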
din : din * scale*/
+          for (; j < remain; ++j) {
+            if (din_hei_ptr[0] >= 0) {
+              *(doutc0_ptr++) = din_hei_ptr[0];
+            } else {
+              *(doutc0_ptr++) = din_hei_ptr[0] * scale;
+            }
+            if (din_hei_ptr[1] >= 0) {
+              *(doutc1_ptr++) = din_hei_ptr[1];
+            } else {
+              *(doutc1_ptr++) = din_hei_ptr[1] * scale;
+            }
+            if (din_hei_ptr[2] >= 0) {
+              *(doutc2_ptr++) = din_hei_ptr[2];
+            } else {
+              *(doutc2_ptr++) = din_hei_ptr[2] * scale;
+            }
+            if (din_hei_ptr[3] >= 0) {
+              *(doutc3_ptr++) = din_hei_ptr[3];
+            } else {
+              *(doutc3_ptr++) = din_hei_ptr[3] * scale;
+            }
+            din_hei_ptr += 4;
+          }
+          break;
+        default:
+          LOG(FATAL) << "this act_type: "
+                     << static_cast<int>(act_param->active_type)
+                     << " fuse not support";
      }
    } else {
      for (; j < remain; ++j) {
@@ -1110,14 +1691,14 @@ inline bool write_to_output_c4_fp32(const float* din,
         *(doutc1_ptr++) = din_hei_ptr[1];
         *(doutc2_ptr++) = din_hei_ptr[2];
         *(doutc3_ptr++) = din_hei_ptr[3];
-        din_hei_ptr += w4;
+        din_hei_ptr += 4;
       }
     }
   }
   return true;
 }
-
+// clang-format off
 #ifdef __aarch64__
 #define NCHWC8_TRANS_FP32_COMPUTE \
   "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \
@@ -1161,6 +1742,48 @@ inline bool write_to_output_c4_fp32(const float* din,
   "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ \
   "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/

+#define NCHWC8_TRANS_FP32_RELU6 \
+  "fmin v16.4s, v16.4s, %[six].4s \n" /*relu6*/ \
+  "fmin v17.4s, v17.4s, %[six].4s \n" /*relu6*/ \
+  "fmin v18.4s, v18.4s, %[six].4s \n" /*relu6*/ \
+  "fmin v19.4s, v19.4s, %[six].4s \n" /*relu6*/ \
+  \
+  "fmin v8.4s, v8.4s, %[six].4s \n" /*relu6*/ \
+  "fmin v9.4s, v9.4s, %[six].4s \n" /*relu6*/ \
+  "fmin v12.4s, v12.4s, %[six].4s \n" /*relu6*/ \
+  "fmin v13.4s, v13.4s, %[six].4s \n" /*relu6*/
+
+#define NCHWC8_TRANS_FP32_LEAKY_RELU \
+  "fcmge v10.4s, v16.4s, v20.4s \n" /* vcgeq_f32 */ \
+  "fcmge v11.4s, v17.4s, v20.4s \n" /* vcgeq_f32 */ \
+  "fcmge v14.4s, v18.4s, v20.4s \n" /* vcgeq_f32 */ \
+  "fcmge v15.4s, v19.4s, v20.4s \n" /* vcgeq_f32 */ \
+  \
+  "fcmge v21.4s, v8.4s, v20.4s \n" /* vcgeq_f32 */ \
+  "fcmge v22.4s, v9.4s, v20.4s \n" /* vcgeq_f32 */ \
+  "fcmge v23.4s, v12.4s, v20.4s \n" /* vcgeq_f32 */ \
+  "fcmge v24.4s, v13.4s, v20.4s \n" /* vcgeq_f32 */ \
+  \
+  "fmul v25.4s, v16.4s, %[scale].4s \n" /* mul */ \
+  "fmul v26.4s, v17.4s, %[scale].4s \n" /* mul */ \
+  "fmul v27.4s, v18.4s, %[scale].4s \n" /* mul */ \
+  "fmul v28.4s, v19.4s, %[scale].4s \n" /* mul */ \
+  \
+  "fmul v29.4s, v8.4s, %[scale].4s \n" /* mul */ \
+  "fmul v30.4s, v9.4s, %[scale].4s \n" /* mul */ \
+  "fmul v31.4s, v12.4s, %[scale].4s \n" /* mul */ \
+  \
+  "bif v16.16b, v25.16b, v10.16b \n" /* choose*/ \
+  "bif v17.16b, v26.16b, v11.16b \n" /* choose*/ \
+  "bif v18.16b, v27.16b, v14.16b \n" /* choose*/ \
+  "bif v19.16b, v28.16b, v15.16b \n" /* choose*/ \
+  "fmul v25.4s, v13.4s, %[scale].4s \n" /* mul */ \
+  \
+  "bif v8.16b, v29.16b, v21.16b \n" /* choose*/ \
+  "bif v9.16b, v30.16b, v22.16b \n" /* choose*/ \
+  "bif v12.16b, v31.16b, v23.16b \n" /* choose*/ \
+  "bif v13.16b, v25.16b, v24.16b \n" /* choose*/
+
 #define NCHWC8_TRANS_FP32_STORE \
   "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ \
   "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ \
@@ -1174,6 +1797,7 @@ inline bool write_to_output_c4_fp32(const float* din,
   "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ \
   \
   "bne 1b \n" /* jump to main loop*/
+
 #else
 #define NCHWC8_TRANS_FP32_COMPUTE \
   "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" \
@@ -1203,6 +1827,48 @@ inline bool write_to_output_c4_fp32(const float* din,
   "vmax.f32 q6, q6, q15 @ relu\n" \
   "vmax.f32 q7, q7, q15 @ relu\n"

+#define NCHWC8_TRANS_FP32_RELU6 \
+  "vmin.f32 q0, q0, %q[six] @ relu6\n" \
+  "vmin.f32 q1, q1, %q[six] @ relu6\n" \
+  "vmin.f32 q2, q2, %q[six] @ relu6\n" \
+  "vmin.f32 q3, q3, %q[six] @ relu6\n" \
+  \
+  "vmin.f32 q4, q4, %q[six] @ relu6\n" \
+  "vmin.f32 q5, q5, %q[six] @ relu6\n" \
+  "vmin.f32 q6, q6, %q[six] @ relu6\n" \
+  "vmin.f32 q7, q7, %q[six] @ relu6\n"
+
+#define NCHWC8_TRANS_FP32_LEAKY_RELU \
+  "vmov.u32 q15, #0 @ re-zero, q15 is scratched below\n" \
+  "vcge.f32 q9, q0, q15 @ q0 > 0 \n" \
+  "vcge.f32 q10, q1, q15 @ q0 > 0 \n" \
+  "vcge.f32 q11, q2, q15 @ q0 > 0 \n" \
+  "vcge.f32 q12, q3, q15 @ q0 > 0 \n" \
+  "vmul.f32 q13, q0, %q[scale] \n" \
+  "vmul.f32 q14, q1, %q[scale] \n" \
+  "vmul.f32 q15, q2, %q[scale] \n" \
+  \
+  "vbif q0, q13, q9 @ choose \n" \
+  "vmul.f32 q9, q3, %q[scale] \n" \
+  \
+  "vbif q1, q14, q10 @ choose \n" \
+  "vbif q2, q15, q11 @ choose \n" \
+  "vbif q3, q9, q12 @ choose \n" \
+  \
+  "vmov.u32 q15, #0 @ restore zero for the compares below\n" \
+  "vcge.f32 q9, q4, q15 @ q0 > 0 \n" \
+  "vcge.f32 q10, q5, q15 @ q0 > 0 \n" \
+  "vcge.f32 q11, q6, q15 @ q0 > 0 \n" \
+  "vcge.f32 q12, q7, q15 @ q0 > 0 \n" \
+  "vmul.f32 q13, q4, %q[scale] \n" \
+  "vmul.f32 q14, q5, %q[scale] \n" \
+  "vmul.f32 q15, q6, %q[scale] \n" \
+  \
+  "vbif q4, q13, q9 @ choose \n" \
+  "vmul.f32 q9, q7, %q[scale] \n" \
+  \
+  "vbif q5, q14, q10 @ choose \n" \
+  "vbif q6, q15, q11 @ choose \n" \
+  "vbif q7, q9, q12 @ choose \n"
+
 #define NCHWC8_TRANS_FP32_STORE \
   "subs %[cnt], %[cnt], #1 @ loop count - 1\n" \
   "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " \
@@ -1232,84 +1898,23 @@ inline bool write_to_output_c8_fp32(const float* din,
   "bne 1b @ jump to main loop\n"
 #endif
-/*wirte result in outputs
-* input din: [n, c / 8, h, w * 8], output dout: [n, c, h, w]
-*/
-inline bool write_to_output_c8_fp32(const float* din,
-                                    float* dout,
-                                    int ch_n,
-                                    int hei_n,
-                                    int cs,
-                                    int ce,
-                                    int hs,
-                                    int he,
-                                    int ws,
-                                    int we,
-                                    int channel,
-                                    int height,
-                                    int width,
-                                    bool flag_relu,
-                                    float* trash_ptr) {
-  if (ch_n != 8 || hei_n <= 0) {
-    LOG(ERROR) << "ch_n must be equal 8 and hei_n is more than zero";
-    return false;
-  }
-  int size_c_out = width * height;
-
-  float* doutc0r0 = dout + cs * size_c_out + hs * width + ws;
-  float* doutc1r0 = doutc0r0 + size_c_out;
-  float* doutc2r0 = doutc1r0 + size_c_out;
-  float* doutc3r0 = doutc2r0 + size_c_out;
-  float* doutc4r0 = doutc3r0 + size_c_out;
-  float* doutc5r0 = doutc4r0 + size_c_out;
-  float* doutc6r0 = doutc5r0 + size_c_out;
-  float* doutc7r0 = doutc6r0 + size_c_out;
-
-  const float* ptr_din = din;
-
-  int size_h = (he > height ?
height : he) - hs; // size_h == hei_n - - int valid_w = we - ws; - int cnt = valid_w / 4; - - if (we > width) { - cnt--; - } - if (flag_relu) { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - float* doutc1_ptr = doutc1r0 + size_w; - float* doutc2_ptr = doutc2r0 + size_w; - float* doutc3_ptr = doutc3r0 + size_w; - float* doutc4_ptr = doutc4r0 + size_w; - float* doutc5_ptr = doutc5r0 + size_w; - float* doutc6_ptr = doutc6r0 + size_w; - float* doutc7_ptr = doutc7r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 7: - doutc1_ptr = trash_ptr; - case 6: - doutc2_ptr = trash_ptr; - case 5: - doutc3_ptr = trash_ptr; - case 4: - doutc4_ptr = trash_ptr; - case 3: - doutc5_ptr = trash_ptr; - case 2: - doutc6_ptr = trash_ptr; - case 1: - doutc7_ptr = trash_ptr; - default: - break; - } - } - ptr_din = din + i * valid_w * ch_n; - const float* din_hei_ptr = ptr_din; - if (cnt > 0) { - int cnt_loop = cnt; +// clang-format on +inline void act_switch_c8_fp32(const float* din_ptr, + float* doutc0_ptr, + float* doutc1_ptr, + float* doutc2_ptr, + float* doutc3_ptr, + float* doutc4_ptr, + float* doutc5_ptr, + float* doutc6_ptr, + float* doutc7_ptr, + int cnt_loop, + const operators::ActivationParam* act_param) { + if (act_param != nullptr && act_param->has_active) { + float32x4_t six = vdupq_n_f32(act_param->Relu_clipped_coef); + float32x4_t scale = vdupq_n_f32(act_param->Leaky_relu_alpha); + switch (act_param->active_type) { + case lite_api::ActivationType::kRelu: #ifdef __aarch64__ asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_RELU NCHWC8_TRANS_FP32_STORE @@ -1322,9 +1927,10 @@ inline bool write_to_output_c8_fp32(const float* din, [doutc6r0] "+r"(doutc6_ptr), [doutc7r0] "+r"(doutc7_ptr), [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) + [ptr_din] "+r"(din_ptr) : - : "v1", + : "v0", + "v1", "v2", "v3", "v4", @@ -1338,7 +1944,6 @@ inline bool write_to_output_c8_fp32(const float* din, "v12", "v13", "v14", - "v15", "v16", "v17", "v18", @@ -1355,66 +1960,17 @@ inline bool write_to_output_c8_fp32(const float* din, [doutc5r0] "+r"(doutc5_ptr), [doutc6r0] "+r"(doutc6_ptr), [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), + [ptr_din] "+r"(din_ptr), [cnt] "+r"(cnt_loop) : - : "q0", "q1", "q2", "q3", "q4", "q15"); + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q15"); #endif - } - if (we > width) { - int offset = 32 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; - for (; i < width; ++i) { - *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); - *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f); - *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2], 0.f); - *(doutc3_ptr++) = LITEMAX(din_hei_ptr[3], 0.f); - *(doutc4_ptr++) = LITEMAX(din_hei_ptr[4], 0.f); - *(doutc5_ptr++) = LITEMAX(din_hei_ptr[5], 0.f); - *(doutc6_ptr++) = LITEMAX(din_hei_ptr[6], 0.f); - *(doutc7_ptr++) = LITEMAX(din_hei_ptr[7], 0.f); - din_hei_ptr += 8; - } - } - } - } else { - for (int i = 0; i < size_h; i++) { - int size_w = i * width; - float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; - float* doutc1_ptr = doutc1r0 + size_w; - float* doutc2_ptr = doutc2r0 + size_w; - float* doutc3_ptr = doutc3r0 + size_w; - float* doutc4_ptr = doutc4r0 + size_w; - float* doutc5_ptr = doutc5r0 + size_w; - float* doutc6_ptr = doutc6r0 + size_w; - float* doutc7_ptr = doutc7r0 + size_w; - if (ce > channel) { - switch (ce - channel) { - case 7: - doutc1_ptr = trash_ptr; - case 6: - doutc2_ptr = trash_ptr; - case 5: - doutc3_ptr = 
trash_ptr;
-        case 4:
-          doutc4_ptr = trash_ptr;
-        case 3:
-          doutc5_ptr = trash_ptr;
-        case 2:
-          doutc6_ptr = trash_ptr;
-        case 1:
-          doutc7_ptr = trash_ptr;
-        default:
-          break;
-        }
-      }
-      ptr_din = din + i * valid_w * ch_n;
-      const float* din_hei_ptr = ptr_din;
-      if (cnt > 0) {
-        int cnt_loop = cnt;
+      break;
+    case lite_api::ActivationType::kRelu6:
+/* 0 <= din <= 6 */
 #ifdef __aarch64__
-        asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_STORE
+      asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_RELU
+                   NCHWC8_TRANS_FP32_RELU6 NCHWC8_TRANS_FP32_STORE
                    : [doutc0r0] "+r"(doutc0_ptr),
                    [doutc1r0] "+r"(doutc1_ptr),
                    [doutc2r0] "+r"(doutc2_ptr),
@@ -1424,8 +1980,8 @@ inline bool write_to_output_c8_fp32(const float* din,
                    [doutc6r0] "+r"(doutc6_ptr),
                    [doutc7r0] "+r"(doutc7_ptr),
                    [cnt] "+r"(cnt_loop),
-                   [ptr_din] "+r"(din_hei_ptr)
-                   :
+                   [ptr_din] "+r"(din_ptr)
+                   : [six] "w"(six)
                    : "v0",
                    "v1",
                    "v2",
@@ -1441,14 +1997,29 @@ inline bool write_to_output_c8_fp32(const float* din,
                    "v12",
                    "v13",
                    "v14",
-                   "v15",
                    "v16",
                    "v17",
                    "v18",
                    "v19",
                    "v20");
 #else
-        asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_STORE
+      asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_RELU
+                   NCHWC8_TRANS_FP32_RELU6 NCHWC8_TRANS_FP32_STORE
+                   : [doutc0r0] "+r"(doutc0_ptr),
+                   [doutc1r0] "+r"(doutc1_ptr),
+                   [doutc2r0] "+r"(doutc2_ptr),
+                   [doutc3r0] "+r"(doutc3_ptr),
+                   [doutc4r0] "+r"(doutc4_ptr),
+                   [doutc5r0] "+r"(doutc5_ptr),
+                   [doutc6r0] "+r"(doutc6_ptr),
+                   [doutc7r0] "+r"(doutc7_ptr),
+                   [ptr_din] "+r"(din_ptr),
+                   [cnt] "+r"(cnt_loop)
+                   : [six] "w"(six)
+                   : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q15");
+#endif
+      break;
+    case lite_api::ActivationType::kLeakyRelu:
+/*din = din >= 0 ? din : din * scale*/
+#ifdef __aarch64__
+      asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_LEAKY_RELU
+                   NCHWC8_TRANS_FP32_STORE
                    : [doutc0r0] "+r"(doutc0_ptr),
                    [doutc1r0] "+r"(doutc1_ptr),
                    [doutc2r0] "+r"(doutc2_ptr),
@@ -1457,16 +2028,323 @@ inline bool write_to_output_c8_fp32(const float* din,
                    [doutc5r0] "+r"(doutc5_ptr),
                    [doutc6r0] "+r"(doutc6_ptr),
                    [doutc7r0] "+r"(doutc7_ptr),
-                   [ptr_din] "+r"(din_hei_ptr),
+                   [cnt] "+r"(cnt_loop),
+                   [ptr_din] "+r"(din_ptr)
+                   : [scale] "w"(scale)
+                   : "v0",
+                   "v1",
+                   "v2",
+                   "v3",
+                   "v4",
+                   "v5",
+                   "v6",
+                   "v7",
+                   "v8",
+                   "v9",
+                   "v10",
+                   "v11",
+                   "v12",
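All of these asm statements share one operand pattern: `"+r"` read-write pointer registers that post-index addressing advances, `"w"` inputs pinning `float32x4_t` values such as `six` and `scale` in SIMD registers, and an explicit clobber list for every temporary. Reduced to its skeleton (an AArch64 sketch for illustration, not code from this patch):

```cpp
#include <arm_neon.h>

// Minimal example of the constraint pattern used throughout this file:
// clamp 4 floats at `p` in place against `six`, advancing the pointer.
static inline void clamp4_inplace(float* p, float32x4_t six) {
  asm volatile(
      "ldr q0, [%[p]] \n"
      "fmin v0.4s, v0.4s, %[six].4s \n"
      "str q0, [%[p]], #16 \n"
      : [p] "+r"(p)     // read-write pointer, advanced by post-index
      : [six] "w"(six)  // value kept in a SIMD register
      : "v0", "memory");
}
```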
+ "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20"); +#else + asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [doutc4r0] "+r"(doutc4_ptr), + [doutc5r0] "+r"(doutc5_ptr), + [doutc6r0] "+r"(doutc6_ptr), + [doutc7r0] "+r"(doutc7_ptr), + [ptr_din] "+r"(din_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q15"); #endif + } +} + +/*wirte result in outputs +* input din: [n, c / 8, h, w * 8], output dout: [n, c, h, w] +*/ +inline bool write_to_output_c8_fp32(const float* din, + float* dout, + int ch_n, + int hei_n, + int cs, + int ce, + int hs, + int he, + int ws, + int we, + int channel, + int height, + int width, + bool flag_relu, + float* trash_ptr, + operators::ActivationParam* act_param) { + if (ch_n != 8 || hei_n <= 0) { + LOG(ERROR) << "ch_n must be equal 8 and hei_n is more than zero"; + return false; + } + int size_c_out = width * height; + + float* doutc0r0 = dout + cs * size_c_out + hs * width + ws; + float* doutc1r0 = doutc0r0 + size_c_out; + float* doutc2r0 = doutc1r0 + size_c_out; + float* doutc3r0 = doutc2r0 + size_c_out; + float* doutc4r0 = doutc3r0 + size_c_out; + float* doutc5r0 = doutc4r0 + size_c_out; + float* doutc6r0 = doutc5r0 + size_c_out; + float* doutc7r0 = doutc6r0 + size_c_out; + + const float* ptr_din = din; + + int size_h = (he > height ? height : he) - hs; // size_h == hei_n + + int valid_w = we - ws; + int w4 = 4; + int cnt = valid_w / 4; + + if (we > width) { + cnt--; + } + for (int i = 0; i < size_h; i++) { + int size_w = i * width; + float* doutc0_ptr = doutc0r0 + size_w; // doutc0r0 + width; + float* doutc1_ptr = doutc1r0 + size_w; + float* doutc2_ptr = doutc2r0 + size_w; + float* doutc3_ptr = doutc3r0 + size_w; + float* doutc4_ptr = doutc4r0 + size_w; + float* doutc5_ptr = doutc5r0 + size_w; + float* doutc6_ptr = doutc6r0 + size_w; + float* doutc7_ptr = doutc7r0 + size_w; + if (ce > channel) { + switch (ce - channel) { + case 7: + doutc1_ptr = trash_ptr; + case 6: + doutc2_ptr = trash_ptr; + case 5: + doutc3_ptr = trash_ptr; + case 4: + doutc4_ptr = trash_ptr; + case 3: + doutc5_ptr = trash_ptr; + case 2: + doutc6_ptr = trash_ptr; + case 1: + doutc7_ptr = trash_ptr; + default: + break; } - if (we > width) { - int offset = 32 * (valid_w / 4 - 1); - din_hei_ptr = ptr_din + offset; - int i = we - 4; + } + ptr_din = din + i * valid_w * ch_n; + const float* din_hei_ptr = ptr_din; + if (cnt > 0) { + int cnt_loop = cnt; + act_switch_c8_fp32(din_hei_ptr, + doutc0_ptr, + doutc1_ptr, + doutc2_ptr, + doutc3_ptr, + doutc4_ptr, + doutc5_ptr, + doutc6_ptr, + doutc7_ptr, + cnt_loop, + act_param); + } + if (we > width) { + int offset = 32 * (valid_w / 4 - 1); + din_hei_ptr = ptr_din + offset; + doutc0_ptr += w4 * cnt; + doutc1_ptr += w4 * cnt; + doutc2_ptr += w4 * cnt; + doutc3_ptr += w4 * cnt; + doutc4_ptr += w4 * cnt; + doutc5_ptr += w4 * cnt; + doutc6_ptr += w4 * cnt; + doutc7_ptr += w4 * cnt; + int i = we - 4; + if (act_param != nullptr && act_param->has_active) { + float six = act_param->Relu_clipped_coef; + float scale = act_param->Leaky_relu_alpha; + switch (act_param->active_type) { + case lite_api::ActivationType::kRelu: + for (; i < width; ++i) { + *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); + *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f); + *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2], 0.f); + *(doutc3_ptr++) = LITEMAX(din_hei_ptr[3], 0.f); + *(doutc4_ptr++) = 
LITEMAX(din_hei_ptr[4], 0.f); + *(doutc5_ptr++) = LITEMAX(din_hei_ptr[5], 0.f); + *(doutc6_ptr++) = LITEMAX(din_hei_ptr[6], 0.f); + *(doutc7_ptr++) = LITEMAX(din_hei_ptr[7], 0.f); + din_hei_ptr += 8; + } + break; + case lite_api::ActivationType::kRelu6: + /* 0 <= din <= 6 */ + for (; i < width; ++i) { + float tmp1 = LITEMAX(din_hei_ptr[0], 0.f); + float tmp2 = LITEMAX(din_hei_ptr[1], 0.f); + float tmp3 = LITEMAX(din_hei_ptr[2], 0.f); + float tmp4 = LITEMAX(din_hei_ptr[3], 0.f); + float tmp5 = LITEMAX(din_hei_ptr[4], 0.f); + float tmp6 = LITEMAX(din_hei_ptr[5], 0.f); + float tmp7 = LITEMAX(din_hei_ptr[6], 0.f); + float tmp8 = LITEMAX(din_hei_ptr[7], 0.f); + *(doutc0_ptr++) = LITEMIN(tmp1, six); + *(doutc1_ptr++) = LITEMIN(tmp2, six); + *(doutc2_ptr++) = LITEMIN(tmp3, six); + *(doutc3_ptr++) = LITEMIN(tmp4, six); + *(doutc4_ptr++) = LITEMIN(tmp5, six); + *(doutc5_ptr++) = LITEMIN(tmp6, six); + *(doutc6_ptr++) = LITEMIN(tmp7, six); + *(doutc7_ptr++) = LITEMIN(tmp8, six); + din_hei_ptr += 8; + } + break; + case lite_api::ActivationType::kLeakyRelu: + /*din = din >= 0 ? din : din * scale*/ + for (; i < width; ++i) { + if (din_hei_ptr[0] >= 0) { + *(doutc0_ptr++) = din_hei_ptr[0]; + } else { + *(doutc0_ptr++) = din_hei_ptr[0] * scale; + } + if (din_hei_ptr[1] >= 0) { + *(doutc1_ptr++) = din_hei_ptr[1]; + } else { + *(doutc1_ptr++) = din_hei_ptr[1] * scale; + } + if (din_hei_ptr[2] >= 0) { + *(doutc2_ptr++) = din_hei_ptr[2]; + } else { + *(doutc2_ptr++) = din_hei_ptr[2] * scale; + } + if (din_hei_ptr[3] >= 0) { + *(doutc3_ptr++) = din_hei_ptr[3]; + } else { + *(doutc3_ptr++) = din_hei_ptr[3] * scale; + } + if (din_hei_ptr[4] >= 0) { + *(doutc4_ptr++) = din_hei_ptr[4]; + } else { + *(doutc4_ptr++) = din_hei_ptr[4] * scale; + } + if (din_hei_ptr[4] >= 0) { + *(doutc5_ptr++) = din_hei_ptr[5]; + } else { + *(doutc5_ptr++) = din_hei_ptr[5] * scale; + } + if (din_hei_ptr[6] >= 0) { + *(doutc6_ptr++) = din_hei_ptr[6]; + } else { + *(doutc6_ptr++) = din_hei_ptr[6] * scale; + } + if (din_hei_ptr[7] >= 0) { + *(doutc7_ptr++) = din_hei_ptr[7]; + } else { + *(doutc7_ptr++) = din_hei_ptr[7] * scale; + } + din_hei_ptr += 8; + } + break; + default: + LOG(FATAL) << "this act_type: " + << static_cast(act_param->active_type) + << " fuse not support"; + } + } else { for (; i < width; ++i) { *(doutc0_ptr++) = din_hei_ptr[0]; *(doutc1_ptr++) = din_hei_ptr[1]; diff --git a/lite/backends/arm/math/conv_depthwise.h b/lite/backends/arm/math/conv_depthwise.h index b6c3478880d5cb59999d23ff03e2e342708ca95b..503dab29b6c4f0b9d3ff30a89060e473194216a9 100644 --- a/lite/backends/arm/math/conv_depthwise.h +++ b/lite/backends/arm/math/conv_depthwise.h @@ -37,6 +37,7 @@ void conv_3x3s1_depthwise_fp32(const float* i_data, const float* weights, const float* bias, const operators::ConvParam& param, + const operators::ActivationParam act_param, ARMContext* ctx); void conv_3x3s2_depthwise_fp32(const float* i_data, @@ -67,6 +68,7 @@ void conv_depthwise_3x3s1_fp32(const float* din, int pad, bool flag_bias, bool flag_relu, + const operators::ActivationParam act_param, ARMContext* ctx); void conv_depthwise_3x3s2_fp32(const float* din, diff --git a/lite/backends/arm/math/conv_impl.cc b/lite/backends/arm/math/conv_impl.cc index dc68e65f42a799d7fa7e8be75f5afcf3166b1df3..642d1c2c1b964b9553e522d70a086531f1706420 100644 --- a/lite/backends/arm/math/conv_impl.cc +++ b/lite/backends/arm/math/conv_impl.cc @@ -579,6 +579,7 @@ void conv_depthwise_3x3_fp32(const void* din, ARMContext* ctx, const float* scale) { auto paddings = *param.paddings; + auto 
   const int pad_h = paddings[0];
   const int pad_w = paddings[2];
   int stride = param.strides[1];
@@ -603,6 +604,7 @@ void conv_depthwise_3x3_fp32(const void* din,
                              pad,
                              flag_bias,
                              flag_relu,
+                             act_param,
                              ctx);
   } else {
     conv_3x3s1_depthwise_fp32(reinterpret_cast<const float*>(din),
@@ -617,6 +619,7 @@ void conv_depthwise_3x3_fp32(const void* din,
                               reinterpret_cast<const float*>(weights),
                               bias,
                               param,
+                              act_param,
                               ctx);
   }
diff --git a/lite/backends/arm/math/conv_impl.h b/lite/backends/arm/math/conv_impl.h
index f4d00039aaa635d0ffb31846fd9ff9077ac0c621..60f74b7feecc91a2fe8262a1fea4dce26430031d 100644
--- a/lite/backends/arm/math/conv_impl.h
+++ b/lite/backends/arm/math/conv_impl.h
@@ -316,7 +316,9 @@ void fill_bias_int8(int* tensor,
                     int channel_size);
 
 // new winograd
-void weight_trans_c4(
+void weight_trans_c4_8x8(
+    float* dest, const float* src, int ic, int oc, void* workspace);
+void weight_trans_c4_4x4(
     float* dest, const float* src, int ic, int oc, void* workspace);
 void conv_compute_6x6_3x3(const float* input,
                           float* output,
@@ -331,6 +333,32 @@ void conv_compute_6x6_3x3(const float* input,
                           const float* bias,
                           const operators::ConvParam& param,
                           ARMContext* ctx);
+void conv_compute_2x2_3x3(const float* input,
+                          float* output,
+                          int num,
+                          int chout,
+                          int hout,
+                          int wout,
+                          int chin,
+                          int hin,
+                          int win,
+                          const float* weight,
+                          const float* bias,
+                          const operators::ConvParam& param,
+                          ARMContext* ctx);
+void conv_compute_2x2_3x3_small(const float* input,
+                                float* output,
+                                int num,
+                                int chout,
+                                int hout,
+                                int wout,
+                                int chin,
+                                int hin,
+                                int win,
+                                const float* weight,
+                                const float* bias,
+                                const operators::ConvParam& param,
+                                ARMContext* ctx);
 }  // namespace math
 }  // namespace arm
 }  // namespace lite
diff --git a/lite/backends/arm/math/elementwise.cc b/lite/backends/arm/math/elementwise.cc
index a4c61f9a9d181924c28cdd009f8412278d44f5bb..186ad19735799dcb91641354af4b4f09692bfce9 100644
--- a/lite/backends/arm/math/elementwise.cc
+++ b/lite/backends/arm/math/elementwise.cc
@@ -557,6 +557,52 @@ void elementwise_mul<float>(const float* dinx,
   }
 }
 
+template <>
+void elementwise_mul<int>(const int* dinx,
+                          const int* diny,
+                          int* dout,
+                          int num) {
+  int cnt = num >> 4;
+  int remain = num % 16;
+#pragma omp parallel for
+  for (int i = 0; i < cnt; ++i) {
+    const int* dinx_ptr = dinx + (i << 4);
+    const int* diny_ptr = diny + (i << 4);
+    int* dout_ptr = dout + (i << 4);
+
+    int32x4_t dinx0 = vld1q_s32(dinx_ptr);
+    int32x4_t dinx1 = vld1q_s32(dinx_ptr + 4);
+    int32x4_t dinx2 = vld1q_s32(dinx_ptr + 8);
+    int32x4_t dinx3 = vld1q_s32(dinx_ptr + 12);
+
+    int32x4_t diny0 = vld1q_s32(diny_ptr);
+    int32x4_t diny1 = vld1q_s32(diny_ptr + 4);
+    int32x4_t diny2 = vld1q_s32(diny_ptr + 8);
+    int32x4_t diny3 = vld1q_s32(diny_ptr + 12);
+
+    dinx0 = vmulq_s32(dinx0, diny0);
+    dinx1 = vmulq_s32(dinx1, diny1);
+    dinx2 = vmulq_s32(dinx2, diny2);
+    dinx3 = vmulq_s32(dinx3, diny3);
+
+    vst1q_s32(dout_ptr, dinx0);
+    vst1q_s32(dout_ptr + 4, dinx1);
+    vst1q_s32(dout_ptr + 8, dinx2);
+    vst1q_s32(dout_ptr + 12, dinx3);
+  }
+  if (remain > 0) {
+    const int* dinx_ptr = dinx + (cnt << 4);
+    const int* diny_ptr = diny + (cnt << 4);
+    int* dout_ptr = dout + (cnt << 4);
+    for (int i = 0; i < remain; i++) {
+      *dout_ptr = *dinx_ptr * *diny_ptr;
+      dout_ptr++;
+      dinx_ptr++;
+      diny_ptr++;
+    }
+  }
+}
+
 template <>
 void elementwise_mul_relu<float>(const float* dinx,
                                  const float* diny,
@@ -678,6 +724,73 @@ void elementwise_mul_broadcast<float>(const float* dinx,
   }
 }
 
+template <>
+void elementwise_mul_broadcast<int>(const int* dinx,
+                                    const int* diny,
+                                    int* dout,
+                                    int batch,
+                                    int channels,
+                                    int num) {
+#pragma omp parallel for collapse(2)
+  for (int i = 0; i < batch; ++i) {
+    for (int j = 0; j < channels; ++j) {
+      int offset = (i * channels + j) * num;
+      const int* din_ptr = dinx + offset;
+      const int diny_data = diny[j];
+      int* dout_ptr = dout + offset;
+
+      int cnt = num >> 4;
+      int remain = num % 16;
+      int32x4_t rb = vdupq_n_s32(diny_data);
+      for (int k = 0; k < cnt; ++k) {
+        int32x4_t din0 = vld1q_s32(din_ptr);
+        int32x4_t din1 = vld1q_s32(din_ptr + 4);
+        int32x4_t din2 = vld1q_s32(din_ptr + 8);
+        int32x4_t din3 = vld1q_s32(din_ptr + 12);
+
+        din0 = vmulq_s32(din0, rb);
+        din1 = vmulq_s32(din1, rb);
+        din2 = vmulq_s32(din2, rb);
+        din3 = vmulq_s32(din3, rb);
+
+        vst1q_s32(dout_ptr, din0);
+        vst1q_s32(dout_ptr + 4, din1);
+        vst1q_s32(dout_ptr + 8, din2);
+        vst1q_s32(dout_ptr + 12, din3);
+
+        din_ptr += 16;
+        dout_ptr += 16;
+      }
+      if (remain >= 8) {
+        int32x4_t din0 = vld1q_s32(din_ptr);
+        int32x4_t din1 = vld1q_s32(din_ptr + 4);
+        din0 = vmulq_s32(din0, rb);
+        din1 = vmulq_s32(din1, rb);
+        vst1q_s32(dout_ptr, din0);
+        vst1q_s32(dout_ptr + 4, din1);
+        din_ptr += 8;
+        dout_ptr += 8;
+        remain -= 8;
+      }
+      if (remain >= 4) {
+        int32x4_t din0 = vld1q_s32(din_ptr);
+        din0 = vmulq_s32(din0, rb);
+        vst1q_s32(dout_ptr, din0);
+        din_ptr += 4;
+        dout_ptr += 4;
+        remain -= 4;
+      }
+      if (remain > 0) {
+        for (int p = 0; p < remain; ++p) {
+          *dout_ptr = *din_ptr * diny_data;
+          dout_ptr++;
+          din_ptr++;
+        }
+      }
+    }
+  }
+}
+
 template <>
 void elementwise_mul_relu_broadcast<float>(const float* dinx,
                                            const float* diny,
diff --git a/lite/backends/arm/math/funcs.h b/lite/backends/arm/math/funcs.h
index 8977b5712c13dec0088d83db4cbfef8494785301..6fb64138221ea4ca4d70ddf04f53b5bd4cdf4a92 100644
--- a/lite/backends/arm/math/funcs.h
+++ b/lite/backends/arm/math/funcs.h
@@ -51,6 +51,7 @@
 #include "lite/backends/arm/math/prior_box.h"
 #include "lite/backends/arm/math/reduce_max.h"
 #include "lite/backends/arm/math/reduce_mean.h"
+#include "lite/backends/arm/math/reduce_prod.h"
 #include "lite/backends/arm/math/scale.h"
 #include "lite/backends/arm/math/sequence_expand.h"
 #include "lite/backends/arm/math/sequence_pool.h"
@@ -61,6 +62,7 @@
 #include "lite/backends/arm/math/slice.h"
 #include "lite/backends/arm/math/softmax.h"
 #include "lite/backends/arm/math/split.h"
+#include "lite/backends/arm/math/split_merge_lod_tenosr.h"
 #include "lite/backends/arm/math/stack.h"
 #include "lite/backends/arm/math/topk.h"
 #include "lite/backends/arm/math/yolo_box.h"
diff --git a/lite/backends/arm/math/interpolate.cc b/lite/backends/arm/math/interpolate.cc
index e9e18043dfc09001ebba23f952a59474630e54aa..1c53142fc53bc785efcbf28fa007d403ad99ab70 100644
--- a/lite/backends/arm/math/interpolate.cc
+++ b/lite/backends/arm/math/interpolate.cc
@@ -477,17 +477,23 @@ void nearest_interp(const float* src,
   float scale_h_new = (with_align)
                           ? (static_cast<float>(h_in - 1) / (h_out - 1))
                           : (static_cast<float>(h_in) / (h_out));
-
-#pragma omp parallel for collapse(2) schedule(static)
-  for (int h = 0; h < h_out; ++h) {
-    for (int w = 0; w < w_out; ++w) {
-      int near_x = (with_align) ? static_cast<int>(scale_w_new * w + 0.5)
-                                : static_cast<int>(scale_w_new * w);
-      int near_y = (with_align) ? static_cast<int>(scale_h_new * h + 0.5)
-                                : static_cast<int>(scale_h_new * h);
-      near_x = near_x < 0 ? 0 : near_x;
-      near_y = near_y < 0 ? 0 : near_y;
-      dst[h * w_out + w] = src[near_y * w_in + near_x];
+  if (with_align) {
+    for (int h = 0; h < h_out; ++h) {
+      float* dst_p = dst + h * w_out;
+      int near_y = static_cast<int>(scale_h_new * h + 0.5);
+      for (int w = 0; w < w_out; ++w) {
+        int near_x = static_cast<int>(scale_w_new * w + 0.5);
+        *dst_p++ = src[near_y * w_in + near_x];
+      }
+    }
+  } else {
+    for (int h = 0; h < h_out; ++h) {
+      float* dst_p = dst + h * w_out;
+      int near_y = static_cast<int>(scale_h_new * h);
+      for (int w = 0; w < w_out; ++w) {
+        int near_x = static_cast<int>(scale_w_new * w);
+        *dst_p++ = src[near_y * w_in + near_x];
+      }
     }
   }
 }
@@ -520,9 +526,9 @@ void interpolate(lite::Tensor* X,
   }
   auto out_size = OutSize;
   if (out_size != nullptr) {
-    auto out_size_data = get_new_data_from_tensor<float>(out_size);
-    out_height = static_cast<int>(out_size_data[0]);
-    out_width = static_cast<int>(out_size_data[1]);
+    auto out_size_data = get_new_data_from_tensor<int>(out_size);
+    out_height = out_size_data[0];
+    out_width = out_size_data[1];
   }
 }
 float height_scale = scale;
@@ -544,8 +550,10 @@ void interpolate(lite::Tensor* X,
   int out_w = Out->dims()[3];
   int spatial_in = in_h * in_w;
   int spatial_out = out_h * out_w;
-  for (int i = 0; i < count; ++i) {
-    if ("Bilinear" == interpolate_type) {
+
+  if ("Bilinear" == interpolate_type) {
+#pragma omp parallel for
+    for (int i = 0; i < count; ++i) {
       bilinear_interp(din + spatial_in * i,
                       in_w,
                       in_h,
@@ -555,7 +563,10 @@ void interpolate(lite::Tensor* X,
                       1.f / width_scale,
                       1.f / height_scale,
                       with_align);
-    } else if ("Nearest" == interpolate_type) {
+    }
+  } else if ("Nearest" == interpolate_type) {
+#pragma omp parallel for
+    for (int i = 0; i < count; ++i) {
       nearest_interp(din + spatial_in * i,
                      in_w,
                      in_h,
diff --git a/lite/backends/arm/math/packed_sgemm_c4.cc b/lite/backends/arm/math/packed_sgemm_c4.cc
index 8087e0337bda0866f5d399a07ecb674f0fa55a3e..af4934e85756f03ec197520b2b5c130e27bdcad6 100644
--- a/lite/backends/arm/math/packed_sgemm_c4.cc
+++ b/lite/backends/arm/math/packed_sgemm_c4.cc
@@ -695,7 +695,6 @@ void sgemm_prepack_c4_common(int M,
     }
   }
 }
-
 void sgemm_prepack_c4_small(int M,
                             int N,
                             int K,
@@ -1146,6 +1145,540 @@ void sgemm_prepack_c4_small(int M,
   }
 }
 
+void sgemm_prepack_c4_small(int M,
+                            int N,
+                            int K,
+                            const float* A_packed,
+                            const float* B,
+                            float* C,
+                            ARMContext* ctx) {
+  const int m_round = (M + 3) / 4 * 4;
+  const int k_round = (K + 3) / 4 * 4;
+  const int mloop = m_round >> 2;
+  const int lda = 4 * k_round;
+  const int ldb_byte = 4 * N * sizeof(float);
+  const int kcnt = k_round >> 2;
+#ifdef __aarch64__
+  float32x4_t vzero = vdupq_n_f32(0.f);
+#endif
+  for (int m = 0; m < mloop; ++m) {
+    const float* b = B;
+    int n = N;
+#ifdef __aarch64__
+    for (; n > 7; n -= 8) {
+      int cnt = kcnt;
+      const float* a_ptr = A_packed;
+      const float* b_ptr = b;
+      // clang-format off
+      asm volatile(
+        "0:\n"
+        /* load a0, a1 */
+        "ld1 {v16.4s, v17.4s}, [%[a]], #32  \n"
+        /* load b0, b1 */
+        "ld1 {v0.4s, v1.4s},   [%[b]], #32  \n"
+        /* load b2, b3 */
+        "ld1 {v2.4s, v3.4s},   [%[b]], #32  \n"
+        /* load a2, a3 */
+        "fmul v8.4s,  v16.4s, v0.s[0]       \n"
+        "fmul v9.4s,  v16.4s, v1.s[0]       \n"
+        "fmul v10.4s, v16.4s, v2.s[0]       \n"
+        "fmul v11.4s, v16.4s, v3.s[0]       \n"
+        "ld1 {v18.4s, v19.4s}, [%[a]], #32  \n"
+        "prfm pldl1keep, [%[b]]             \n"
+        "fmla v8.4s,  v17.4s, v0.s[1]       \n"
+        "fmla v9.4s,  v17.4s, v1.s[1]       \n"
+        "fmla v10.4s, v17.4s, v2.s[1]       \n"
+        "fmla v11.4s, v17.4s, v3.s[1]       \n"
+        /* load b4, b5 */
+        "ld1 {v4.4s, v5.4s},   [%[b]], #32  \n"
+        "fmla v8.4s,  v18.4s, v0.s[2]       \n"
+        "fmla v9.4s,  v18.4s, v1.s[2]       \n"
+        "fmla v10.4s, v18.4s, v2.s[2]       \n"
+
"fmla v11.4s, v18.4s, v3.s[2] \n" + /* load b6, b7 */ + "ld1 {v6.4s, v7.4s}, [%[b]], #32 \n" + "fmla v8.4s, v19.4s, v0.s[3] \n" + "fmla v9.4s, v19.4s, v1.s[3] \n" + "fmla v10.4s, v19.4s, v2.s[3] \n" + "fmla v11.4s, v19.4s, v3.s[3] \n" + "sub %[b], %[b], #128 \n" + "fmul v12.4s, v16.4s, v4.s[0] \n" + "fmul v13.4s, v16.4s, v5.s[0] \n" + "fmul v14.4s, v16.4s, v6.s[0] \n" + "fmul v15.4s, v16.4s, v7.s[0] \n" + "add %[b], %[b], %[ldb] \n" + "fmla v12.4s, v17.4s, v4.s[1] \n" + "fmla v13.4s, v17.4s, v5.s[1] \n" + "fmla v14.4s, v17.4s, v6.s[1] \n" + "fmla v15.4s, v17.4s, v7.s[1] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "fmla v12.4s, v18.4s, v4.s[2] \n" + "fmla v13.4s, v18.4s, v5.s[2] \n" + "fmla v14.4s, v18.4s, v6.s[2] \n" + "fmla v15.4s, v18.4s, v7.s[2] \n" + /* load b0, b1 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "fmla v12.4s, v19.4s, v4.s[3] \n" + "fmla v13.4s, v19.4s, v5.s[3] \n" + "fmla v14.4s, v19.4s, v6.s[3] \n" + "fmla v15.4s, v19.4s, v7.s[3] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "beq 2f \n" + "1:\n" + /* load b2, b3 */ + "ld1 {v2.4s, v3.4s}, [%[b]], #32 \n" + "fmla v8.4s, v16.4s, v0.s[0] \n" + "fmla v9.4s, v16.4s, v1.s[0] \n" + "fmla v10.4s, v16.4s, v2.s[0] \n" + "fmla v11.4s, v16.4s, v3.s[0] \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "prfm pldl1keep, [%[b]] \n" + "fmla v8.4s, v17.4s, v0.s[1] \n" + "fmla v9.4s, v17.4s, v1.s[1] \n" + "fmla v10.4s, v17.4s, v2.s[1] \n" + "fmla v11.4s, v17.4s, v3.s[1] \n" + /* load b4, b5 */ + "ld1 {v4.4s, v5.4s}, [%[b]], #32 \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v18.4s, v1.s[2] \n" + "fmla v10.4s, v18.4s, v2.s[2] \n" + "fmla v11.4s, v18.4s, v3.s[2] \n" + /* load b6, b7 */ + "ld1 {v6.4s, v7.4s}, [%[b]], #32 \n" + "fmla v8.4s, v19.4s, v0.s[3] \n" + "fmla v9.4s, v19.4s, v1.s[3] \n" + "fmla v10.4s, v19.4s, v2.s[3] \n" + "fmla v11.4s, v19.4s, v3.s[3] \n" + "sub %[b], %[b], #128 \n" + "fmla v12.4s, v16.4s, v4.s[0] \n" + "fmla v13.4s, v16.4s, v5.s[0] \n" + "fmla v14.4s, v16.4s, v6.s[0] \n" + "fmla v15.4s, v16.4s, v7.s[0] \n" + "add %[b], %[b], %[ldb] \n" + "fmla v12.4s, v17.4s, v4.s[1] \n" + "fmla v13.4s, v17.4s, v5.s[1] \n" + "fmla v14.4s, v17.4s, v6.s[1] \n" + "fmla v15.4s, v17.4s, v7.s[1] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "fmla v12.4s, v18.4s, v4.s[2] \n" + "fmla v13.4s, v18.4s, v5.s[2] \n" + "fmla v14.4s, v18.4s, v6.s[2] \n" + "fmla v15.4s, v18.4s, v7.s[2] \n" + /* load b0, b1 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "fmla v12.4s, v19.4s, v4.s[3] \n" + "fmla v13.4s, v19.4s, v5.s[3] \n" + "fmla v14.4s, v19.4s, v6.s[3] \n" + "fmla v15.4s, v19.4s, v7.s[3] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "bne 1b \n" + "2:\n" + "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[c]], #64 \n" + "st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%[c]], #64 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte), + [vzero] "w" (vzero) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "cc", "memory" + ); + b += 4 * 8; + } + for (; n > 3; n -= 4) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + "0:\n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + /* load b0-b3 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[b]], #32 \n" + "fmul v8.4s, v16.4s, v0.s[0] \n" + "fmul v9.4s, v16.4s, v1.s[0] \n" + "fmul v10.4s, v16.4s, v2.s[0] \n" + "fmul v11.4s, v16.4s, v3.s[0] \n" + /* load a2, a3 */ + 
"ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "sub %[b], %[b], #64 \n" + "fmla v8.4s, v17.4s, v0.s[1] \n" + "fmla v9.4s, v17.4s, v1.s[1] \n" + "fmla v10.4s, v17.4s, v2.s[1] \n" + "fmla v11.4s, v17.4s, v3.s[1] \n" + "add %[b], %[b], %[ldb] \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v18.4s, v1.s[2] \n" + "fmla v10.4s, v18.4s, v2.s[2] \n" + "fmla v11.4s, v18.4s, v3.s[2] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "fmla v8.4s, v19.4s, v0.s[3] \n" + "fmla v9.4s, v19.4s, v1.s[3] \n" + "fmla v10.4s, v19.4s, v2.s[3] \n" + "fmla v11.4s, v19.4s, v3.s[3] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "beq 2f \n" + "1:\n" + /* load b0-b3 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[b]], #32 \n" + "fmla v8.4s, v16.4s, v0.s[0] \n" + "fmla v9.4s, v16.4s, v1.s[0] \n" + "fmla v10.4s, v16.4s, v2.s[0] \n" + "fmla v11.4s, v16.4s, v3.s[0] \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "sub %[b], %[b], #64 \n" + "fmla v8.4s, v17.4s, v0.s[1] \n" + "fmla v9.4s, v17.4s, v1.s[1] \n" + "fmla v10.4s, v17.4s, v2.s[1] \n" + "fmla v11.4s, v17.4s, v3.s[1] \n" + "add %[b], %[b], %[ldb] \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v18.4s, v1.s[2] \n" + "fmla v10.4s, v18.4s, v2.s[2] \n" + "fmla v11.4s, v18.4s, v3.s[2] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "fmla v8.4s, v19.4s, v0.s[3] \n" + "fmla v9.4s, v19.4s, v1.s[3] \n" + "fmla v10.4s, v19.4s, v2.s[3] \n" + "fmla v11.4s, v19.4s, v3.s[3] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "bne 1b \n" + "2:\n" + "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[c]], #64 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte), + [vzero] "w" (vzero) + : "v0", "v1", "v2", "v3", "v8", "v9", + "v10", "v11", "v16", "v17", "v18", + "v19", "cc", "memory" + ); + b += 4 * 4; + } + for (; n > 0; n--) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + "0:\n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + /* load b0 */ + "ld1 {v0.4s}, [%[b]], #16 \n" + "fmul v8.4s, v16.4s, v0.s[0] \n" + "fmul v9.4s, v17.4s, v0.s[1] \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "sub %[b], %[b], #16 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "add %[b], %[b], %[ldb] \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v19.4s, v0.s[3] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "beq 2f \n" + "1:\n" + /* load b0 */ + "ld1 {v0.4s}, [%[b]], #16 \n" + "fmla v8.4s, v16.4s, v0.s[0] \n" + "fmla v9.4s, v17.4s, v0.s[1] \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "sub %[b], %[b], #16 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "add %[b], %[b], %[ldb] \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v19.4s, v0.s[3] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "bne 1b \n" + "2:\n" + "fadd v8.4s, v8.4s, v9.4s \n" + "st1 {v8.4s}, [%[c]], #16 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte), + [vzero] "w" (vzero) + : "v0", "v8", "v9", "v16", "v17", + "v18", "v19", "cc", "memory" + ); + b += 4; + } +#else + for (; n > 7; n -= 8) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + // clang-format off + asm volatile( + "0:\n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + "vld1.32 {d0-d3}, [%[b]]! \n" + /* load b2, b3 */ + "vld1.32 {d4-d7}, [%[b]]! 
\n" + "vmul.f32 q8, q4, d0[0] \n" + "vmul.f32 q9, q4, d2[0] \n" + "vmul.f32 q10, q4, d4[0] \n" + "vmul.f32 q11, q4, d6[0] \n" + /* load a2, a3 */ + "vld1.32 {d12-d15}, [%[a]]! \n" + "pld [%[b]] \n" + "vmla.f32 q8, q5, d0[1] \n" + "vmla.f32 q9, q5, d2[1] \n" + "vmla.f32 q10, q5, d4[1] \n" + "vmla.f32 q11, q5, d6[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q8, q6, d1[0] \n" + "vmla.f32 q9, q6, d3[0] \n" + "vmla.f32 q10, q6, d5[0] \n" + "vmla.f32 q11, q6, d7[0] \n" + "pld [%[b], #64] \n" + "vmla.f32 q8, q7, d1[1] \n" + "vmla.f32 q9, q7, d3[1] \n" + /* load b4, b5 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmla.f32 q10, q7, d5[1] \n" + "vmla.f32 q11, q7, d7[1] \n" + /* load b6, b7 */ + "vld1.32 {d4-d7}, [%[b]]! \n" + "vmul.f32 q12, q4, d0[0] \n" + "vmul.f32 q13, q4, d2[0] \n" + "vmul.f32 q14, q4, d4[0] \n" + "vmul.f32 q15, q4, d6[0] \n" + "sub %[b], %[b], #128 \n" + "vmla.f32 q12, q5, d0[1] \n" + "vmla.f32 q13, q5, d2[1] \n" + "vmla.f32 q14, q5, d4[1] \n" + "vmla.f32 q15, q5, d6[1] \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q12, q6, d1[0] \n" + "vmla.f32 q13, q6, d3[0] \n" + "vmla.f32 q14, q6, d5[0] \n" + "vmla.f32 q15, q6, d7[0] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + "vmla.f32 q12, q7, d1[1] \n" + "vmla.f32 q13, q7, d3[1] \n" + /* load b0, b1 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmla.f32 q14, q7, d5[1] \n" + "vmla.f32 q15, q7, d7[1] \n" + "beq 2f \n" + "1:\n" + /* load b2, b3 */ + "vld1.32 {d4-d7}, [%[b]]! \n" + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q4, d2[0] \n" + "vmla.f32 q10, q4, d4[0] \n" + "vmla.f32 q11, q4, d6[0] \n" + /* load a2, a3 */ + "vld1.32 {d12-d15}, [%[a]]! \n" + "pld [%[b]] \n" + "vmla.f32 q8, q5, d0[1] \n" + "vmla.f32 q9, q5, d2[1] \n" + "vmla.f32 q10, q5, d4[1] \n" + "vmla.f32 q11, q5, d6[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q8, q6, d1[0] \n" + "vmla.f32 q9, q6, d3[0] \n" + "vmla.f32 q10, q6, d5[0] \n" + "vmla.f32 q11, q6, d7[0] \n" + "pld [%[b], #64] \n" + "vmla.f32 q8, q7, d1[1] \n" + "vmla.f32 q9, q7, d3[1] \n" + /* load b4, b5 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmla.f32 q10, q7, d5[1] \n" + "vmla.f32 q11, q7, d7[1] \n" + /* load b6, b7 */ + "vld1.32 {d4-d7}, [%[b]]! \n" + "vmla.f32 q12, q4, d0[0] \n" + "vmla.f32 q13, q4, d2[0] \n" + "vmla.f32 q14, q4, d4[0] \n" + "vmla.f32 q15, q4, d6[0] \n" + "sub %[b], %[b], #128 \n" + "vmla.f32 q12, q5, d0[1] \n" + "vmla.f32 q13, q5, d2[1] \n" + "vmla.f32 q14, q5, d4[1] \n" + "vmla.f32 q15, q5, d6[1] \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q12, q6, d1[0] \n" + "vmla.f32 q13, q6, d3[0] \n" + "vmla.f32 q14, q6, d5[0] \n" + "vmla.f32 q15, q6, d7[0] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + "vmla.f32 q12, q7, d1[1] \n" + "vmla.f32 q13, q7, d3[1] \n" + /* load b0, b1 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmla.f32 q14, q7, d5[1] \n" + "vmla.f32 q15, q7, d7[1] \n" + "bne 1b \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]! \n" + "vst1.32 {d20-d23}, [%[c]]! \n" + "vst1.32 {d24-d27}, [%[c]]! \n" + "vst1.32 {d28-d31}, [%[c]]! \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte) + : "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15", "cc", "memory" + ); + b += 4 * 8; + } + for (; n > 3; n -= 4) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + "0:\n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + /* load b0-b3 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vld1.32 {d4-d7}, [%[b]]! 
\n" + "vmul.f32 q8, q4, d0[0] \n" + "vmul.f32 q9, q4, d2[0] \n" + "vmul.f32 q10, q4, d4[0] \n" + "vmul.f32 q11, q4, d6[0] \n" + /* load a2, a3 */ + "vld1.32 {d12-d15}, [%[a]]!\n" + "sub %[b], %[b], #64 \n" + "vmla.f32 q8, q5, d0[1] \n" + "vmla.f32 q9, q5, d2[1] \n" + "vmla.f32 q10, q5, d4[1] \n" + "vmla.f32 q11, q5, d6[1] \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q8, q6, d1[0] \n" + "vmla.f32 q9, q6, d3[0] \n" + "vmla.f32 q10, q6, d5[0] \n" + "vmla.f32 q11, q6, d7[0] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + "vmla.f32 q8, q7, d1[1] \n" + "vmla.f32 q9, q7, d3[1] \n" + "vmla.f32 q10, q7, d5[1] \n" + "vmla.f32 q11, q7, d7[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "beq 2f \n" + "1:\n" + /* load b0-b3 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vld1.32 {d4-d7}, [%[b]]! \n" + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q4, d2[0] \n" + "vmla.f32 q10, q4, d4[0] \n" + "vmla.f32 q11, q4, d6[0] \n" + /* load a2, a3 */ + "vld1.32 {d12-d15}, [%[a]]!\n" + "sub %[b], %[b], #64 \n" + "vmla.f32 q8, q5, d0[1] \n" + "vmla.f32 q9, q5, d2[1] \n" + "vmla.f32 q10, q5, d4[1] \n" + "vmla.f32 q11, q5, d6[1] \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q8, q6, d1[0] \n" + "vmla.f32 q9, q6, d3[0] \n" + "vmla.f32 q10, q6, d5[0] \n" + "vmla.f32 q11, q6, d7[0] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + "vmla.f32 q8, q7, d1[1] \n" + "vmla.f32 q9, q7, d3[1] \n" + "vmla.f32 q10, q7, d5[1] \n" + "vmla.f32 q11, q7, d7[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "bne 1b \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]!\n" + "vst1.32 {d20-d23}, [%[c]]!\n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte) + : "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "cc", "memory" + ); + b += 4 * 4; + } + for (; n > 0; n--) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + "0:\n" + /* load a0, a1 */ + "vld1.32 {d2-d5}, [%[a]]! \n" + /* load b0 */ + "vld1.32 {d0-d1}, [%[b]]! \n" + "vmul.f32 q5, q1, d0[0] \n" + "vmul.f32 q6, q2, d0[1] \n" + /* load a2, a3 */ + "vld1.32 {d6-d9}, [%[a]]! \n" + "sub %[b], %[b], #16 \n" + "subs %[cnt], %[cnt], #1 \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q5, q3, d1[0] \n" + "vmla.f32 q6, q4, d1[1] \n" + /* load a0, a1 */ + "vld1.32 {d2-d5}, [%[a]]! \n" + "beq 2f \n" + "1:\n" + /* load b0 */ + "vld1.32 {d0-d1}, [%[b]]! \n" + "vmla.f32 q5, q1, d0[0] \n" + "vmla.f32 q6, q2, d0[1] \n" + /* load a2, a3 */ + "vld1.32 {d6-d9}, [%[a]]! \n" + "sub %[b], %[b], #16 \n" + "subs %[cnt], %[cnt], #1 \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q5, q3, d1[0] \n" + "vmla.f32 q6, q4, d1[1] \n" + /* load a0, a1 */ + "vld1.32 {d2-d5}, [%[a]]! 
\n" + "bne 1b \n" + "2:\n" + "vadd.f32 q5, q5, q6 \n" + "vst1.32 {d10-d11}, [%[c]]!\n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [ldb] "r" (ldb_byte) + : "q0", "q1", "q2", "q3", "q4", + "q5", "q6", "q7", "q8", "cc", "memory" + ); + // clang-format on + b += 4; + } +#endif + A_packed += lda; + } +} + void sgemm_prepack_c4(int M, int N, int K, diff --git a/lite/backends/arm/math/packed_sgemm_c4.h b/lite/backends/arm/math/packed_sgemm_c4.h index 21e5af634315a7da66914bb04775088fec55550c..3229ff3e0774ce8bff02b12d79d7ec50ed873cea 100644 --- a/lite/backends/arm/math/packed_sgemm_c4.h +++ b/lite/backends/arm/math/packed_sgemm_c4.h @@ -47,6 +47,13 @@ void sgemm_prepack_c4_small(int M, bool has_bias, bool has_relu, ARMContext* ctx); +void sgemm_prepack_c4_small(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + ARMContext* ctx); } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc index 8524d7376f2bb7e337dfc11b890c00e281d2e880..9d42fd98df3ccec33457dd6d20ecb3b11684e04c 100644 --- a/lite/backends/arm/math/pooling.cc +++ b/lite/backends/arm/math/pooling.cc @@ -167,7 +167,7 @@ void pooling_basic(const float* din, "ld1 {v2.4s-v3.4s}, [%[data_in_channel]], #32 \n" \ "fmax v6.4s, v4.4s, v5.4s \n" \ "subs %w[cnt], %w[cnt], #1 \n" \ - "fmax %w[vmax].4s, %w[vmax].4s, v6.4s \n" \ + "fmax %[vmax].4s, %[vmax].4s, v6.4s \n" \ "bne 1b \n" #define GLOBAL_AVG \ "1: \n" \ @@ -176,7 +176,7 @@ void pooling_basic(const float* din, "ld1 {v0.4s-v1.4s}, [%[data_in_channel]], #32 \n" \ "fadd %[vsum].4s, %[vsum].4s, v3.4s \n" \ "subs %w[cnt], %w[cnt], #1 \n" \ - "fadd %w[vsum].4s, %w[vsum].4s, v4.4s \n" \ + "fadd %[vsum].4s, %[vsum].4s, v4.4s \n" \ "ld1 {v2.4s-v3.4s}, [%[data_in_channel]], #32 \n" \ "bne 1b \n" diff --git a/lite/backends/arm/math/reduce_prod.cc b/lite/backends/arm/math/reduce_prod.cc new file mode 100644 index 0000000000000000000000000000000000000000..e7b3f7095f2087af365d0765f49df7902df42bb9 --- /dev/null +++ b/lite/backends/arm/math/reduce_prod.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/arm/math/reduce_prod.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math {} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/reduce_prod.h b/lite/backends/arm/math/reduce_prod.h new file mode 100644 index 0000000000000000000000000000000000000000..6c8898288fa498a6f97709a27306e6975dffc975 --- /dev/null +++ b/lite/backends/arm/math/reduce_prod.h @@ -0,0 +1,185 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+
+template <typename T>
+void reduce_prod_n(const T* src,
+                   T* dst,
+                   int num_in,
+                   int channel_in,
+                   int height_in,
+                   int width_in) {
+  int hw_size = height_in * width_in;
+  int chw_size = channel_in * hw_size;
+  int data_index, src_index, src_index0;
+  for (int c = 0; c < channel_in; ++c) {
+    for (int h = 0; h < height_in; ++h) {
+      for (int w = 0; w < width_in; ++w) {
+        data_index = c * hw_size + h * width_in + w;
+        dst[data_index] = static_cast<T>(1);
+        for (int n = 0; n < num_in; ++n) {
+          src_index = n * chw_size + data_index;
+          dst[data_index] *= src[src_index];
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void reduce_prod_c(const T* src,
+                   T* dst,
+                   int num_in,
+                   int channel_in,
+                   int height_in,
+                   int width_in) {
+  int hw_size = height_in * width_in;
+  int chw_size = hw_size * channel_in;
+  int data_index, src_index0, src_index;
+  for (int n = 0; n < num_in; ++n) {
+    for (int h = 0; h < height_in; ++h) {
+      for (int w = 0; w < width_in; ++w) {
+        data_index = n * hw_size + h * width_in + w;
+        src_index0 = n * chw_size + h * width_in + w;
+        dst[data_index] = static_cast<T>(1);
+        for (int c = 0; c < channel_in; ++c) {
+          src_index = src_index0 + c * hw_size;
+          dst[data_index] *= src[src_index];
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void reduce_prod_h(const T* src,
+                   T* dst,
+                   int num_in,
+                   int channel_in,
+                   int height_in,
+                   int width_in) {
+  int cw_size = channel_in * width_in;
+  int chw_size = cw_size * height_in;
+  int hw_size = height_in * width_in;
+  int data_index, src_index, src_index0;
+  for (int n = 0; n < num_in; ++n) {
+    for (int c = 0; c < channel_in; ++c) {
+      for (int w = 0; w < width_in; ++w) {
+        data_index = n * cw_size + c * width_in + w;
+        src_index0 = n * chw_size + c * hw_size + w;
+        dst[data_index] = static_cast<T>(1);
+        for (int h = 0; h < height_in; ++h) {
+          src_index = src_index0 + h * width_in;
+          dst[data_index] *= src[src_index];
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void reduce_prod_w(const T* src,
+                   T* dst,
+                   int num_in,
+                   int channel_in,
+                   int height_in,
+                   int width_in) {
+  int ch_size = channel_in * height_in;
+  int hw_size = height_in * width_in;
+  int chw_size = ch_size * width_in;
+  int data_index = 0;
+  int src_index0 = 0;
+  int src_index = 0;
+  for (int n = 0; n < num_in; ++n) {
+    for (int c = 0; c < channel_in; ++c) {
+      for (int h = 0; h < height_in; ++h) {
+        data_index = n * ch_size + c * height_in + h;
+        src_index0 = n * chw_size + c * hw_size + h * width_in;
+        dst[data_index] = static_cast<T>(1);
+        for (int w = 0; w < width_in; ++w) {
+          src_index = src_index0 + w;
+          dst[data_index] *= src[src_index];
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void reduce_prod_nc(const T* src,
+                    T* dst,
+                    int num_in,
+                    int channel_in,
+                    int height_in,
+                    int width_in) {
+  // reduce n first.
+  DDimLite ddimA({1, channel_in, height_in, width_in});
+  lite::Tensor tensor_tmp;
+  tensor_tmp.Resize(ddimA);
+  auto* tmp_out = tensor_tmp.mutable_data<T>();
+  reduce_prod_n(src, tmp_out, num_in, channel_in, height_in, width_in);
+  reduce_prod_c(tmp_out, dst, 1, channel_in, height_in, width_in);
+}
+
+template <typename T>
+void reduce_prod_ch(const T* src,
+                    T* dst,
+                    int num_in,
+                    int channel_in,
+                    int height_in,
+                    int width_in) {
+  // reduce c first
+  DDimLite ddimA({num_in, 1, height_in, width_in});
+  lite::Tensor tensor_tmp;
+  tensor_tmp.Resize(ddimA);
+  auto* tmp_out = tensor_tmp.mutable_data<T>();
+  reduce_prod_c(src, tmp_out, num_in, channel_in, height_in, width_in);
+  reduce_prod_h(tmp_out, dst, num_in, 1, height_in, width_in);
+}
+
+template <typename T>
+void reduce_prod_hw(const T* src,
+                    T* dst,
+                    int num_in,
+                    int channel_in,
+                    int height_in,
+                    int width_in) {
+  // reduce h first
+  DDimLite ddimA({num_in, channel_in, 1, width_in});
+  lite::Tensor tensor_tmp;
+  tensor_tmp.Resize(ddimA);
+  auto* tmp_out = tensor_tmp.mutable_data<T>();
+  reduce_prod_h(src, tmp_out, num_in, channel_in, height_in, width_in);
+  reduce_prod_w(tmp_out, dst, num_in, channel_in, 1, width_in);
+}
+
+template <typename T>
+void reduce_prod_all(const T* src, T* dst, int64_t total_num) {
+  dst[0] = static_cast<T>(1);
+  for (int n = 0; n < total_num; ++n) {
+    dst[0] *= src[n];
+  }
+}
+
+}  // namespace math
+}  // namespace arm
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/arm/math/slice.cc b/lite/backends/arm/math/slice.cc
index 8b9a7690509260ed4c6c0e14750d849f657d2fa8..67ca567fea988acfc9e20e2bfc929e9c3a0bbcb8 100644
--- a/lite/backends/arm/math/slice.cc
+++ b/lite/backends/arm/math/slice.cc
@@ -86,6 +86,13 @@ template void slice<int>(const int* input,
                          std::vector<int> ends,
                          int* out,
                          Context<TARGET(kARM)>* ctx);
+template void slice<float>(const float* input,
+                           std::vector<int64_t> dims,
+                           std::vector<int> axes,
+                           std::vector<int> starts,
+                           std::vector<int> ends,
+                           float* out,
+                           Context<TARGET(kARM)>* ctx);
 
 }  // namespace math
 }  // namespace arm
diff --git a/lite/backends/arm/math/split_merge_lod_tenosr.cc b/lite/backends/arm/math/split_merge_lod_tenosr.cc
new file mode 100644
index 0000000000000000000000000000000000000000..35dc4a455b7c51e0aab1a45c48460ccc513b9a08
--- /dev/null
+++ b/lite/backends/arm/math/split_merge_lod_tenosr.cc
@@ -0,0 +1,62 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
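+//
+// Worked example (illustrative only, derived from the implementation
+// below): for a two-level lod = {{0, 2, 5}, {0, 3, 7, 9, 12, 15}},
+// GetSubLoDAndAbsoluteOffset(lod, 1, 2, 0) first takes level 0
+// (length lod[0][2] - lod[0][1] = 3), then maps [1, 2) down to [2, 5)
+// at level 1 (lengths {2, 3, 3}), and returns
+// sub_lod = {{3}, {2, 3, 3}} with absolute offset {7, 15}.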
+
+#include "lite/backends/arm/math/split_merge_lod_tenosr.h"
+#include <utility>
+#include <vector>
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+
+using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
+LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod,
+                                        size_t start_idx,
+                                        size_t end_idx,
+                                        size_t start_level) {
+  LoD sub_lod;
+  for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) {
+    CHECK(start_idx <= end_idx);
+    CHECK(end_idx < lod[level_idx].size());
+    std::vector<uint64_t> level_lens;
+    for (size_t i = start_idx; i < end_idx; ++i) {
+      level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]);
+    }
+    sub_lod.emplace_back(level_lens);
+    start_idx = lod[level_idx][start_idx];
+    end_idx = lod[level_idx][end_idx];
+  }
+  return LoDAndOffset{sub_lod, {start_idx, end_idx}};
+}
+
+void AppendLoD(LoD *lod, const LoD &lod_length) {
+  CHECK(lod->empty() || lod->size() == lod_length.size());
+  if (lod->empty()) {
+    for (size_t i = 0; i < lod_length.size(); ++i) {
+      lod->emplace_back(std::vector<uint64_t>({0}));
+    }
+  }
+  for (size_t i = 0; i < lod->size(); ++i) {
+    auto &level = (*lod)[i];
+    for (auto len : lod_length[i]) {
+      level.push_back(level.back() + len);
+    }
+  }
+}
+
+}  // namespace math
+}  // namespace arm
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/arm/math/split_merge_lod_tenosr.h b/lite/backends/arm/math/split_merge_lod_tenosr.h
new file mode 100644
index 0000000000000000000000000000000000000000..47c484aa4a203ed1819a7e810f71858f4ef0b4dd
--- /dev/null
+++ b/lite/backends/arm/math/split_merge_lod_tenosr.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <utility>
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace arm {
+namespace math {
+
+std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
+    const LoD &lod, size_t start_idx, size_t end_idx, size_t start_level);
+
+void AppendLoD(LoD *lod, const LoD &lod_length);
+
+}  // namespace math
+}  // namespace arm
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/cuda/math/cudnn_conv.cc b/lite/backends/cuda/math/cudnn_conv.cc
index a4f33f467feb8626696595e95a29fde7b636919d..5dd53084f4079ae68c6fda0530fb5de8cf1d3717 100644
--- a/lite/backends/cuda/math/cudnn_conv.cc
+++ b/lite/backends/cuda/math/cudnn_conv.cc
@@ -89,9 +89,15 @@ bool CudnnConv2D<Ptype_out>::create(const operators::ConvParam& param,
         this->act_desc_, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0));
   }
 
+#if CUDNN_VERSION_MIN(7, 0, 0)
+  cudnnMathType_t math_type =
+      use_tensor_core_ ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH;
+  CUDNN_CHECK(cudnnSetConvolutionMathType(this->conv_desc_, math_type));
+#endif
+
   if (ic == param.groups && ic == oc && ic != 1) {
     this->fwd_algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
-  } else if (1) {
+  } else if (!param.var_length) {
     const auto* i_data = param.x->data<float>();
     const auto* w_data = param.filter->data<float>();
     auto* o_data = param.output->mutable_data<float>(TARGET(kCUDA));
diff --git a/lite/backends/cuda/math/gemm.h b/lite/backends/cuda/math/gemm.h
index 12194d54b08a533a3812e10b5d2f78134c19da24..85576e65018a0e1bdec6f2bd2fdc590bd35e9656 100644
--- a/lite/backends/cuda/math/gemm.h
+++ b/lite/backends/cuda/math/gemm.h
@@ -55,6 +55,8 @@ class Gemm {
            PtypeOut* c,
            Context<TARGET(kCUDA)>* ctx);
 
+  cublasHandle_t get_handle() const { return cu_handle_; }
+
  private:
   cudaStream_t exe_stream_;
   cublasHandle_t cu_handle_;
diff --git a/lite/backends/cuda/math/transpose.cu b/lite/backends/cuda/math/transpose.cu
index cebcece812dc584d0921edea2fef8f129e430b56..c50840fe269657965db8c58b171fce6819009775 100644
--- a/lite/backends/cuda/math/transpose.cu
+++ b/lite/backends/cuda/math/transpose.cu
@@ -69,44 +69,16 @@ void BatchTranspose2DCUDAImpl(const int N,
                               const int W,
                               const T* input,
                               T* out,
-                              CUDAContext* ctx) {
+                              cudaStream_t* stream) {
   const int dh = (H + kTileDim - 1) / kTileDim;
   const int dw = (W + kTileDim - 1) / kTileDim;
   BatchTranspose2DCUDAKernel<
-      T><<<N * dh * dw, dim3(kTileDim, kBlockRows), 0, ctx->exec_stream()>>>(
+      T><<<N * dh * dw, dim3(kTileDim, kBlockRows), 0, *stream>>>(
       N, H, W, dh, dw, input, out);
   cudaError_t error = cudaGetLastError();
   if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
 }
 
-#define TYPE_SPECIALIZED_CUDA_NCHW2NHWC(T)           \
-  template <>                                        \
-  void NCHW2NHWC<T>(const int N,                     \
-                    const int C,                     \
-                    const int HxW,                   \
-                    const T* X,                      \
-                    T* Y,                            \
-                    CUDAContext* ctx) {              \
-    BatchTranspose2DCUDAImpl<T>(N, C, HxW, X, Y, ctx); \
-  }
-TYPE_SPECIALIZED_CUDA_NCHW2NHWC(float)
-TYPE_SPECIALIZED_CUDA_NCHW2NHWC(int8_t)
-#undef TYPE_SPECIALIZED_CUDA_NCHW2NHWC
-
-#define TYPE_SPECIALIZED_CUDA_NHWC2NCHW(T)           \
-  template <>                                        \
-  void NHWC2NCHW<T>(const int N,                     \
-                    const int C,                     \
-                    const int HxW,                   \
-                    const T* X,                      \
-                    T* Y,                            \
-                    CUDAContext* ctx) {              \
-    BatchTranspose2DCUDAImpl<T>(N, HxW, C, X, Y, ctx); \
-  }
-TYPE_SPECIALIZED_CUDA_NHWC2NCHW(float)
-TYPE_SPECIALIZED_CUDA_NHWC2NCHW(int8_t)
-#undef TYPE_SPECIALIZED_CUDA_NHWC2NCHW
-
 template <typename T>
 __global__ void TransposeCUDAKernel(const int size,
                                     const int ndim,
@@ -136,7 +108,9 @@ void TransposeCUDAImpl(const std::vector<int64_t>& X_dims,
                        const std::vector<int>& axes,
                        const T* X,
                        T* Y,
-                       CUDAContext* ctx) {
+                       lite::Tensor* Y_dims_,
+                       lite::Tensor* strides_,
+                       cudaStream_t* stream) {
   CHECK_EQ(X_dims.size(), axes.size()) << "dimension size should be equal";
   int ndim = X_dims.size();
   std::vector<int> strides(ndim, 0);
@@ -156,37 +130,68 @@ void TransposeCUDAImpl(const std::vector<int64_t>& X_dims,
     size *= X_dims[i];
   }
 
-  lite::Tensor Y_dims_, strides_;
-  Y_dims_.Resize(std::vector<int64_t>({ndim}));
-  int* d_y_dims = Y_dims_.mutable_data<int>(TARGET(kCUDA));
-  CopySync<TARGET(kCUDA)>(
-      d_y_dims, Y_dims.data(), sizeof(int) * Y_dims.size(), IoDirection::HtoD);
+  Y_dims_->Resize(std::vector<int64_t>({ndim}));
+  int* d_y_dims = Y_dims_->mutable_data<int>(TARGET(kCUDA));
+  TargetWrapperCuda::MemcpyAsync(d_y_dims,
+                                 Y_dims.data(),
+                                 sizeof(int) * Y_dims.size(),
+                                 IoDirection::HtoD,
+                                 *stream);
 
-  strides_.Resize(std::vector<int64_t>({ndim}));
-  int* d_strides = strides_.mutable_data<int>(TARGET(kCUDA));
-  CopySync<TARGET(kCUDA)>(d_strides,
-                          strides.data(),
-                          sizeof(int) * strides.size(),
-                          IoDirection::HtoD);
+  strides_->Resize(std::vector<int64_t>({ndim}));
+  int* d_strides = strides_->mutable_data<int>(TARGET(kCUDA));
+  TargetWrapperCuda::MemcpyAsync(d_strides,
+                                 strides.data(),
+                                 sizeof(int) * strides.size(),
+                                 IoDirection::HtoD,
+                                 *stream);
 
   const int M = (size + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
-  TransposeCUDAKernel<<<M, CUDA_NUM_THREADS, 0, ctx->exec_stream()>>>(
+  TransposeCUDAKernel<<<M, CUDA_NUM_THREADS, 0, *stream>>>(
       size, ndim, d_strides, d_y_dims, X, Y);
   auto e = cudaGetLastError();
   CHECK_EQ(e, cudaSuccess) << " CUDA: " << cudaGetErrorString(e);
 }
 
-#define TYPE_SPECIALIZED_CUDA_TRANSPOSE(T)                  \
-  template <>                                               \
-  void Transpose<T>(const std::vector<int64_t>& X_dims,     \
-                    const std::vector<int>& axes,           \
-                    const T* X,                             \
-                    T* Y,                                   \
-                    CUDAContext* ctx) {                     \
-    TransposeCUDAImpl<T>(X_dims, axes, X, Y, ctx);          \
-  }
-TYPE_SPECIALIZED_CUDA_TRANSPOSE(float)
-#undef TYPE_SPECIALIZED_CUDA_TRANSPOSEF
+template <typename T>
+void Transpose<T>::NCHW2NHWC(
+    int N, int C, int HxW, const T* X, T* Y, cudaStream_t* stream) {
+  BatchTranspose2DCUDAImpl<T>(N, C, HxW, X, Y, stream);
+}
+
+template <typename T>
+void Transpose<T>::NHWC2NCHW(
+    int N, int C, int HxW, const T* X, T* Y, cudaStream_t* stream) {
+  BatchTranspose2DCUDAImpl<T>(N, HxW, C, X, Y, stream);
+}
+
+template <typename T>
+void Transpose<T>::transpose(T* dst,
+                             const T* src,
+                             const std::vector<int64_t>& src_dims,
+                             const std::vector<int>& axes,
+                             cudaStream_t* stream) {
+  TransposeCUDAImpl<T>(src_dims, axes, src, dst, &Y_dims_, &strides_, stream);
+}
+
+// template <typename T>
+// void Transpose<T>::transpose(T* dst,
+//                              const T* src,
+//                              const std::vector<int>& src_dims,
+//                              const std::vector<int>& axes,
+//                              cudaStream_t* stream) {
+//   std::vector<int64_t> _src_dims(src_dims.size(), 0);
+//   std::transform(
+//       src_dims.begin(),
+//       src_dims.end(),
+//       _src_dims.begin(),
+//       [](int data) -> int64_t { return static_cast<int64_t>(data); });
+//   TransposeCUDAImpl<T>(_src_dims, axes, src, dst, &Y_dims_, &strides_,
+//   stream);
+// }
+
+template class Transpose<int8_t>;
+template class Transpose<float>;
 
 }  // namespace math
 }  // namespace cuda
diff --git a/lite/backends/cuda/math/transpose.h b/lite/backends/cuda/math/transpose.h
index ba2464547b587f44cd9b0ce287a0d40d37d46411..ed52ba3b5590ab631c3c57a0472e16cb0ed51a91 100644
--- a/lite/backends/cuda/math/transpose.h
+++ b/lite/backends/cuda/math/transpose.h
@@ -26,17 +26,27 @@ namespace cuda {
 namespace math {
 
 template <typename T>
-void NCHW2NHWC(int N, int C, int HxW, const T* X, T* Y, CUDAContext* context);
+class Transpose {
+ public:
+  void NCHW2NHWC(int N, int C, int HxW, const T* X, T* Y, cudaStream_t* stream);
 
-template <typename T>
-void NHWC2NCHW(int N, int C, int HxW, const T* X, T* Y, CUDAContext* context);
+  void NHWC2NCHW(int N, int C, int HxW, const T* X, T* Y, cudaStream_t* stream);
 
-template <typename T>
-void Transpose(const std::vector<int64_t>& X_dims,
-               const std::vector<int>& axes,
-               const T* X,
-               T* Y,
-               CUDAContext* ctx);
+  void transpose(T* dst,
+                 const T* src,
+                 const std::vector<int64_t>& src_dims,
+                 const std::vector<int>& axes,
+                 cudaStream_t* stream);
+
+  // void transpose(T* dst,
+  //                const T* src,
+  //                const std::vector<int>& src_dims,
+  //                const std::vector<int>& axes,
+  //                cudaStream_t* stream);
+
+ private:
+  lite::Tensor Y_dims_, strides_;  // for transpose.
+}; } // namespace math } // namespace cuda diff --git a/lite/backends/fpga/CMakeLists.txt b/lite/backends/fpga/CMakeLists.txt index b12fd85caf7e0c79de830b45569e02ba916c34e6..a5207c01a4d5e7b8d05490bd7c9be0dcc01f365e 100644 --- a/lite/backends/fpga/CMakeLists.txt +++ b/lite/backends/fpga/CMakeLists.txt @@ -3,13 +3,35 @@ if (NOT LITE_WITH_FPGA) endif() set(LITE_FPGA_KD_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga/KD") +set(LITE_FPGA_KD_LLAPI_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga/KD/llapi") +set(LITE_FPGA_KD_PE_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga/KD/pes") set(LITE_FPGA_PATH "${PADDLE_SOURCE_DIR}/lite/backends/fpga") message("fpga_kd_path ${LITE_FPGA_KD_PATH}") message("fpga_path ${LITE_FPGA_PATH}") -file(GLOB_RECURSE KD_CPP *.cpp *.cc) +file(GLOB KD_CPP "${LITE_FPGA_KD_PATH}/*.cpp") +file(GLOB PE_CPP "${LITE_FPGA_KD_PE_PATH}/*.cpp") +file(GLOB LLAPI_CPP "${LITE_FPGA_KD_LLAPI_PATH}/*.cpp") file(GLOB FPGA_CPP "${LITE_FPGA_PATH}/*.cc") - -cc_library(kernel_fpga SRCS ${KD_CPP} ${FPGA_CPP}) +set(FPGA_ALL_CPP "") +FOREACH(FILE_PATH ${KD_CPP}) + STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH}) + list(APPEND FPGA_ALL_CPP KD/${FILE_NAME}) +ENDFOREACH(FILE_PATH) +FOREACH(FILE_PATH ${PE_CPP}) + STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH}) + list(APPEND FPGA_ALL_CPP KD/pes/${FILE_NAME}) +ENDFOREACH(FILE_PATH) +FOREACH(FILE_PATH ${LLAPI_CPP}) + STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH}) + list(APPEND FPGA_ALL_CPP KD/llapi/${FILE_NAME}) +ENDFOREACH(FILE_PATH) +FOREACH(FILE_PATH ${FPGA_CPP}) + STRING(REGEX REPLACE ".+/(.+\\..*)" "\\1" FILE_NAME ${FILE_PATH}) + list( APPEND FPGA_ALL_CPP ${FILE_NAME}) +ENDFOREACH(FILE_PATH) +message("fpga kd: ${FPGA_ALL_CPP}") +cc_library(kernel_fpga SRCS ${FPGA_ALL_CPP}) +#cc_library(kernel_fpga SRCS ${KD_CPP} ${FPGA_CPP}) cc_library(lite_tensor_fpga SRCS lite_tensor.cc DEPS memory) -cc_library(fpga_target_wrapper SRCS ${LITE_FPGA_PATH}/target_wrapper.cc DEPS kernel_fpga) +cc_library(fpga_target_wrapper SRCS target_wrapper.cc DEPS kernel_fpga) diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2b9b23070616baf18f347c6b2af2d87a300d428f --- /dev/null +++ b/lite/backends/fpga/KD/debugger.hpp @@ -0,0 +1,140 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
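+//
+// Layout note (illustrative only, derived from chw_to_hwc below): the
+// helper copies NCHW-ordered data into NHWC order, moving element
+// (n, c, h, w) from source index ((n * C + c) * H + h) * W + w to
+// destination index ((n * H + h) * W + w) * C + c. For example, with
+// C = 2, H = 1, W = 2 the row {c0w0, c0w1, c1w0, c1w1} becomes
+// {c0w0, c1w0, c0w1, c1w1}.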
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+
+#define FPGA_PRINT_TENSOR
+
+class Debugger {
+ public:
+  static Debugger& get_instance() {
+    static Debugger s_instance;
+    return s_instance;
+  }
+
+  void registerOutput(std::string op_type, zynqmp::Tensor* tensor) {
+    if (op_type != "conv") {  // NOLINT
+    }
+  }
+
+ private:
+  std::unordered_map<std::string, bool> op_config;
+  Debugger() {
+    op_config["concat"] = true;
+    op_config["conv"] = true;
+    op_config["crop"] = true;
+  }
+};
+
+inline void chw_to_hwc(Tensor* t, float* dst) {
+  int num = t->dims()[0];
+  int channel = t->dims()[1];
+
+  int height = 1;
+  int width = 1;
+  if (t->dims().size() > 2) {
+    height = t->dims()[2];
+  }
+  if (t->dims().size() > 3) {
+    width = t->dims()[3];
+  }
+  const float* chw_data = t->data<float>();
+  float* hwc_data = dst;
+
+  int chw = channel * height * width;
+  int wc = width * channel;
+  int index = 0;
+  for (int n = 0; n < num; n++) {
+    for (int c = 0; c < channel; c++) {
+      for (int h = 0; h < height; h++) {
+        for (int w = 0; w < width; w++) {
+          hwc_data[n * chw + h * wc + w * channel + c] = chw_data[index];
+          index++;
+        }
+      }
+    }
+  }
+}
+
+inline void read_from_file(lite::Tensor* t, const std::string& path) {
+  std::ifstream file_stream;
+  file_stream.open(path);
+  if (!file_stream) {
+    return;
+  }
+  float* data = t->mutable_data<float>();
+  int num = t->numel();
+  for (int i = 0; i < num; ++i) {
+    float value = 0;
+    file_stream >> value;
+    data[i] = value;
+  }
+}
+
+inline void save_float(float* data, const std::string& name, int len) {
+  static int counter = 0;
+  std::string old_string = std::to_string(counter);
+  std::string new_string =
+      std::string(3 - old_string.length(), '0') + old_string;
+
+  std::string file = "arm_" + new_string + name;
+  counter++;
+
+  std::ofstream ofs;
+  ofs.open(file);
+  for (int i = 0; i < len; i++) {
+    float value = data[i];
+    ofs << value << std::endl;
+  }
+  ofs.close();
+}
+
+inline void save_tensor(lite::Tensor* t,
+                        const std::string& name,
+                        bool convert = true) {
+  float* data = const_cast<float*>(t->data<float>());
+  float* dst = new float[t->numel()];
+  if (convert) {
+    chw_to_hwc(t, dst);
+    data = dst;
+  }
+
+  save_float(data, name, t->numel());
+  delete[] dst;
+}
+
+inline void save_tensor(const lite::Tensor* t,
+                        const std::string& name,
+                        bool convert = true) {
+  float* data = const_cast<float*>(t->data<float>());
+  float* dst = new float[t->numel()];
+  if (convert) {
+    chw_to_hwc(const_cast<Tensor*>(t), dst);
+    data = dst;
+  }
+
+  save_float(data, name, t->numel());
+
+  delete[] dst;
+}
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/fpga/KD/dl_engine.cpp b/lite/backends/fpga/KD/dl_engine.cpp
old mode 100644
new mode 100755
index 9849e4275b5d0f59346b9684530610853f1a560c..ea503518a0f39671e77157f14788a1cadb4579f3
--- a/lite/backends/fpga/KD/dl_engine.cpp
+++ b/lite/backends/fpga/KD/dl_engine.cpp
@@ -13,14 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
 #include "lite/backends/fpga/KD/dl_engine.hpp"
+
 namespace paddle {
 namespace zynqmp {
 
 DLEngine::DLEngine() {
   open_device();
-  struct DeviceInfo info;
-  int ret = get_device_info(info);
-  filter::set_filter_capacity(info.filter_cap);
+  int ret = get_device_info(info_);
+  filter::set_filter_capacity(info_.filter_cap);
+  filter::set_colunm(info_.colunm);
 }
 
 }  // namespace zynqmp
diff --git a/lite/backends/fpga/KD/dl_engine.hpp b/lite/backends/fpga/KD/dl_engine.hpp
old mode 100644
new mode 100755
index 829f41dfebfabfe5642bd4cf107fc6c54f3ffd86..eddf5ca454cdc9e91f87d6e4f2c8dfc13f35fdc6
--- a/lite/backends/fpga/KD/dl_engine.hpp
+++ b/lite/backends/fpga/KD/dl_engine.hpp
@@ -15,7 +15,6 @@ limitations under the License. */
 #pragma once
 #include
-
 #include "lite/backends/fpga/KD/llapi/filter.h"
 #include "lite/backends/fpga/KD/llapi/zynqmp_api.h"
 
@@ -29,8 +28,15 @@ class DLEngine {
     return s_instance;
   }
 
+  DeviceInfo& deviceInfo();
+
+  bool isZU3() { return info_.device_type / 100 == 3; }
+
+  float* out_data = nullptr;
+
  private:
   DLEngine();
+  DeviceInfo info_;
 };
 }  // namespace zynqmp
 }  // namespace paddle
diff --git a/lite/backends/fpga/KD/layout.hpp b/lite/backends/fpga/KD/layout.hpp
index 74819cd2120630def0114422b04efe076e1d6cb2..c6b5c911872b6b22633a4319ea708ed23c7e7e36 100644
--- a/lite/backends/fpga/KD/layout.hpp
+++ b/lite/backends/fpga/KD/layout.hpp
@@ -22,6 +22,7 @@ namespace paddle {
 namespace zynqmp {
 
 enum LayoutType {
+  None,
   N,
   NC,
   NCHW,
@@ -39,6 +40,15 @@ class Layout {
   virtual int elementCount(const std::vector<int>& dims) = 0;
 };
 
+struct None : Layout {
+  int numIndex() { return -1; }
+  int channelIndex() { return -1; }
+  int heightIndex() { return -1; }
+  int widthIndex() { return -1; }
+  int alignedElementCount(const std::vector<int>& dims) { return 16; }
+  virtual int elementCount(const std::vector<int>& dims) { return 1; }
+};
+
 struct NCHW : Layout {
   int numIndex() { return 0; }
   int channelIndex() { return 1; }
diff --git a/lite/backends/fpga/KD/llapi/bias_scale.cpp b/lite/backends/fpga/KD/llapi/bias_scale.cpp
index cd60f27f9896e857f8ad566d285a9b9aea1d4721..339a442207e811be31161ff25f60a080572efe8d 100644
--- a/lite/backends/fpga/KD/llapi/bias_scale.cpp
+++ b/lite/backends/fpga/KD/llapi/bias_scale.cpp
@@ -14,6 +14,7 @@ limitations under the License.
*/ #include +#include "lite/backends/fpga/KD/float16.hpp" #include "lite/backends/fpga/KD/llapi/bias_scale.h" #include "lite/backends/fpga/KD/llapi/zynqmp_api.h" @@ -54,7 +55,7 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) { *data_in = ptr_aligned; } -void interleave(float **data_in, int num_after_alignment) { +size_t interleave(float **data_in, int num_after_alignment) { float *ptr_uninterleaved = *data_in; float *ptr_interleaved = (float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); // NOLINT @@ -69,6 +70,7 @@ void interleave(float **data_in, int num_after_alignment) { fpga_free(ptr_uninterleaved); *data_in = ptr_interleaved; + return 2 * num_after_alignment * sizeof(float); } void format_bias_scale_array(float **bias_scale_array, @@ -78,8 +80,9 @@ void format_bias_scale_array(float **bias_scale_array, int div_num = (num + element_num_per_division - 1) / element_num_per_division; int element_num_after_division = align_to_x(element_num_per_division, BS_NUM_ALIGNMENT); - interleave(bias_scale_array, div_num * element_num_after_division); - fpga_flush(*bias_scale_array, 2 * element_num_after_division * sizeof(float)); + size_t mem = + interleave(bias_scale_array, div_num * element_num_after_division); + fpga_flush(*bias_scale_array, mem); } void format_bias_array(float **bias_array, int num) { float *ptr_unaligned = *bias_array; diff --git a/lite/backends/fpga/KD/llapi/bias_scale.h b/lite/backends/fpga/KD/llapi/bias_scale.h index 83f30df18fc7e5967d727ed8ce275d63e1cb29e0..d47d082ccdc6b41cf43860495e43076c17b13ac3 100644 --- a/lite/backends/fpga/KD/llapi/bias_scale.h +++ b/lite/backends/fpga/KD/llapi/bias_scale.h @@ -19,7 +19,7 @@ namespace zynqmp { namespace bias_scale { void align_element(float** data_in, int num_per_div_before_alignment, int num); -void interleave(float** data_in, int num_after_alignment); +size_t interleave(float** data_in, int num_after_alignment); void format_bias_scale_array(float** bias_scale_array, int element_num_per_division, int num); diff --git a/lite/backends/fpga/KD/llapi/filter.cpp b/lite/backends/fpga/KD/llapi/filter.cpp index 0e41a204a854b0b57e1a8c98fb3cc8d5224c807c..30250969b6fbe6e9e5ce7e9f96f963e8bee89224 100644 --- a/lite/backends/fpga/KD/llapi/filter.cpp +++ b/lite/backends/fpga/KD/llapi/filter.cpp @@ -15,6 +15,8 @@ limitations under the License. 
*/ #include "lite/backends/fpga/KD/llapi/filter.h" #include #include +#include +#include #include "lite/backends/fpga/KD/float16.hpp" #include "lite/backends/fpga/KD/llapi/zynqmp_api.h" @@ -23,11 +25,41 @@ namespace zynqmp { namespace filter { static int FILTER_SIZE = 2048; +static int COLUMN = 4; + +void saveToFile(std::string name, void* data_in, int size) { + std::ofstream ofs; + ofs.open(name); + + int8_t* data = static_cast data_in; + for (int i = 0; i < size; i++) { + float value = data[i]; + ofs << value << std::endl; + } + ofs.close(); +} + +void saveFloatToFile(std::string name, float* data_in, int size) { + std::ofstream ofs; + ofs.open(name); + + for (int i = 0; i < size; i++) { + float value = data_in[i]; + ofs << value << std::endl; + } + ofs.close(); +} void set_filter_capacity(uint32_t cap) { FILTER_SIZE = cap; } +void set_colunm(uint32_t column) { COLUMN = column; } + +// replace zynqmp_api.h #define FILTER_NUM_ALIGNMENT +int get_filter_num_alignment() { return COLUMN * 4; } + int calc_division_capacity(int chw) { - int n = FILTER_SIZE / ((chw + 15) / 16) * 32; + int filter_num_alignment = get_filter_num_alignment(); + int n = FILTER_SIZE / ((chw + 15) / 16) * filter_num_alignment; return n < FILTER_SIZE ? n : FILTER_SIZE; } @@ -52,28 +84,28 @@ int calc_num_per_div(int num, int group_num, int division_capacity) { } } -void convert_to_hwc( - char **data_in, int num, int channel, int height, int width) { - char *tmp = *data_in; +void convert_to_hwc(int8_t* chw_data, + int8_t* hwc_data, + int num, + int channel, + int height, + int width) { int chw = channel * height * width; - char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT + int wc = width * channel; + int index = 0; for (int n = 0; n < num; n++) { - int64_t amount_per_row = width * channel; for (int c = 0; c < channel; c++) { for (int h = 0; h < height; h++) { - int64_t offset_height = h * amount_per_row; for (int w = 0; w < width; w++) { - *(data_tmp + n * chw + offset_height + w * channel + c) = - *((*data_in)++); + hwc_data[n * chw + h * wc + w * channel + c] = chw_data[index]; + index++; } } } } - *data_in = data_tmp; - fpga_free(tmp); } -float find_max(float *data_in, int data_size) { +float find_max(float* data_in, int data_size) { float max = 0.0; for (int i = 0; i < data_size; ++i) { float value = data_in[i]; @@ -83,166 +115,178 @@ float find_max(float *data_in, int data_size) { return max; } -signed char float_to_int8(float fdata) { +int8_t float_to_int8(float fdata) { if (fdata < 0.0) { fdata -= 0.5; } else { fdata += 0.5; } - return (signed char)fdata; + return (int8_t)fdata; } -void quantize(float **data_in, int data_size, float max) { - float *tmp = *data_in; +void quantize(float* src, int8_t* dst, int len, float max) { float fix_range = 127; float scale = fix_range / max; - - signed char *tmp_data = (signed char *)fpga_malloc(data_size * sizeof(char)); - for (int i = 0; i < data_size; i++) { - tmp_data[i] = float_to_int8( - (*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale); + for (size_t i = 0; i < len; i++) { + dst[i] = float_to_int8(src[i] * scale); } - *data_in = (float *)tmp_data; // NOLINT - fpga_free(tmp); } -void align_element(char **data_in, int num, int chw) { - int j = 0; +bool should_align_chw(int chw) { int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - if (align_chw != chw) { - char *tmp = *data_in; - char *data_tmp = - (char *)fpga_malloc(num * align_chw * sizeof(char)); // NOLINT - - memset(data_tmp, 0, num * align_chw); - for (j = 0; j < num; 
j++) { - memcpy(data_tmp + j * align_chw, (*data_in) + j * chw, chw); - } - *data_in = data_tmp; - fpga_free(tmp); + return align_chw != chw; +} + +void align_chw(int8_t* src, int8_t* dst, int num, int chw) { + int aligned_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); + memset(dst, 0, num * aligned_chw); + for (int j = 0; j < num; j++) { + memcpy((dst + j * aligned_chw), (src + j * chw), chw); } } -void align_num(char **data_in, +void align_num(int8_t* src, + int8_t* dst, int num_per_div_before_alignment, int num, - int chw) { - int i = 0; - int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); + int align_chw) { + int filter_num_alignment = get_filter_num_alignment(); int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); + align_to_x(num_per_div_before_alignment, filter_num_alignment); - char *tmp = *data_in; int div_num = (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; int num_element = div_num * num_per_div_after_alignment * align_chw; - char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT - - memset(data_tmp, 0, num_element * sizeof(char)); + memset(dst, 0, num_element * sizeof(int8_t)); + int i = 0; for (i = 0; i < div_num - 1; i++) { - memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, - *data_in + num_per_div_before_alignment * align_chw * i, + memcpy(dst + num_per_div_after_alignment * align_chw * i, + src + num_per_div_before_alignment * align_chw * i, num_per_div_before_alignment * align_chw); } - memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, - *data_in + num_per_div_before_alignment * align_chw * i, + memcpy(dst + num_per_div_after_alignment * align_chw * i, + src + num_per_div_before_alignment * align_chw * i, (num - (div_num - 1) * num_per_div_before_alignment) * align_chw); - - *data_in = data_tmp; - fpga_free(tmp); } -void reorder(char **data_in, int num_after_alignment, int chw) { +void reorder(int8_t* src, int8_t* dst, int num_after_alignment, int chw) { int index = 0; int new_index = 0; - + int filter_num_alignment = get_filter_num_alignment(); int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - - char *data_tmp = - (char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT - sizeof(char)); - char *tmp = *data_in; for (index = 0; index < num_after_alignment; index++) { - new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) + - (index / 16 % 2 * 4); - memcpy(data_tmp + index * chw_align, - *data_in + new_index * chw_align, - chw_align); + new_index = index / filter_num_alignment * filter_num_alignment + + (index % (filter_num_alignment / 2) / 4 * 8) + + (index % (filter_num_alignment / 2) % 4) + + (index / (filter_num_alignment / 2) % 2 * 4); + memcpy((dst + index * chw_align), (src + new_index * chw_align), chw_align); } - *data_in = data_tmp; - fpga_free(tmp); } -size_t interleave(char **data_in, int num_after_alignment, int chw) { - int i = 0; - int j = 0; - int k = 0; +void interleave(int8_t* src, int8_t* dst, int num_after_alignment, int chw) { int interleave_per_num = 16; - int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); - char *data_tmp = - (char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT - sizeof(char)); - char *tmp = *data_in; int interleave_num = chw_align * 2 / interleave_per_num; - for (i = 0; i < num_after_alignment; i += 2) { - for (j = 0, k = 0; j < interleave_num; j += 2, k++) { - memcpy(data_tmp + i * chw_align + interleave_per_num * j, - *data_in + i * chw_align + 
interleave_per_num * k, + for (int i = 0; i < num_after_alignment; i += 2) { + for (int j = 0, k = 0; j < interleave_num; j += 2, k++) { + memcpy(dst + i * chw_align + interleave_per_num * j, + src + i * chw_align + interleave_per_num * k, interleave_per_num); - memcpy(data_tmp + i * chw_align + interleave_per_num * (j + 1), - *data_in + (i + 1) * chw_align + interleave_per_num * k, + memcpy(dst + i * chw_align + interleave_per_num * (j + 1), + src + (i + 1) * chw_align + interleave_per_num * k, interleave_per_num); } } - *data_in = data_tmp; - fpga_free(tmp); - return chw_align * num_after_alignment; } -size_t format_filter(float **data_in, - int num, - int channel, - int height, - int width, - int group_num, - float max) { +int8_t* format_filter(float* data_in, + int& mem_size_a, // NOLINT + int num, + int channel, + int height, + int width, + int group_num, + float max, + std::vector& filter_max) { // NOLINT int data_size = channel * height * width * num; int chw = channel * height * width; int division_capacity = calc_division_capacity(chw); + int filter_num_alignment = get_filter_num_alignment(); int num_per_div_before_alignment = calc_num_per_div(num, group_num, division_capacity); int num_per_div_after_alignment = - align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); + align_to_x(num_per_div_before_alignment, filter_num_alignment); int div_num = (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; int residual = num % num_per_div_before_alignment; int num_after_alignment = num_per_div_after_alignment * ((residual == 0) ? div_num : (div_num - 1)) + - align_to_x(residual, FILTER_NUM_ALIGNMENT); - quantize(data_in, data_size, max); - char **quantize_data = (char **)data_in; // NOLINT - convert_to_hwc(quantize_data, num, channel, height, width); - align_element(quantize_data, num, chw); - if (num_after_alignment != num) { - align_num(quantize_data, num_per_div_before_alignment, num, chw); + align_to_x(residual, filter_num_alignment); + + int8_t* quantized_data = + reinterpret_cast(fpga_malloc(data_size * sizeof(int8_t))); + + for (int n = 0; n < num; n++) { + float* filter_start = data_in + n * chw; + float f_max = find_max(filter_start, chw); + int8_t* quantized_start = quantized_data + n * chw; + quantize(filter_start, quantized_start, chw, max); + filter_max.push_back(max); } - reorder(quantize_data, num_after_alignment, chw); - size_t mem_size = interleave(quantize_data, num_after_alignment, chw); - fpga_flush(*quantize_data, + int8_t* hwc_data = + reinterpret_cast(fpga_malloc(data_size * sizeof(int8_t))); + convert_to_hwc(quantized_data, hwc_data, num, channel, height, width); + fpga_free(quantized_data); + + int8_t* temp_data = hwc_data; // NOLINT + int chw_aligned = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); + if (should_align_chw(chw)) { + int8_t* hwc_aligned_data = reinterpret_cast( + fpga_malloc(num * chw_aligned * sizeof(int8_t))); + align_chw(hwc_data, hwc_aligned_data, num, chw); + + temp_data = hwc_aligned_data; + fpga_free(hwc_data); + } + if (num_after_alignment != num) { + int filter_num_alignment = get_filter_num_alignment(); + int num_per_div_after_alignment = + align_to_x(num_per_div_before_alignment, filter_num_alignment); + int num_element = div_num * num_per_div_after_alignment * chw_aligned; + int8_t* num_aligned_data = + reinterpret_cast(fpga_malloc(num_element * sizeof(int8_t))); + align_num(temp_data, + num_aligned_data, + num_per_div_before_alignment, + num, + chw_aligned); + + fpga_free(temp_data); + temp_data = 
num_aligned_data; + } + int8_t* aligned_data = + reinterpret_cast(fpga_malloc(num_after_alignment * chw_aligned)); + reorder(temp_data, aligned_data, num_after_alignment, chw); + fpga_free(temp_data); + int8_t* interleaved_data = + reinterpret_cast(fpga_malloc(num_after_alignment * chw_aligned)); + interleave(aligned_data, interleaved_data, num_after_alignment, chw); + fpga_free(aligned_data); + fpga_flush(interleaved_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment * sizeof(char)); - return mem_size; + mem_size_a = num_after_alignment * chw_aligned; + return interleaved_data; } -void convert_to_hwn(int16_t **data_in, int num, int height, int width) { - int16_t *tmp = *data_in; - int16_t *data_tmp = - (int16_t *)fpga_malloc(height * width * num * sizeof(int16_t)); // NOLINT +void convert_to_hwn(int16_t** data_in, int num, int height, int width) { + int16_t* tmp = *data_in; + int16_t* data_tmp = + (int16_t*)fpga_malloc(height * width * num * sizeof(int16_t)); // NOLINT for (int n = 0; n < num; n++) { for (int h = 0; h < height; h++) { for (int w = 0; w < width; w++) { @@ -254,16 +298,16 @@ void convert_to_hwn(int16_t **data_in, int num, int height, int width) { fpga_free(tmp); } -size_t align_element_n(int16_t **data_in, int num, int height, int width) { +size_t align_element_n(int16_t** data_in, int num, int height, int width) { int unalign_n = num; int align_n = align_to_x(num, FILTER_ELEMENT_ALIGNMENT); int num_element = height * width * align_n; if (unalign_n != align_n) { - int16_t *tmp = *data_in; + int16_t* tmp = *data_in; int num_element = height * width * align_n; - int16_t *data_tmp = - (int16_t *)fpga_malloc(num_element * sizeof(int16_t)); // NOLINT + int16_t* data_tmp = + (int16_t*)fpga_malloc(num_element * sizeof(int16_t)); // NOLINT memset(data_tmp, 0, num_element * sizeof(int16_t)); for (int h = 0; h < height; h++) { @@ -276,17 +320,37 @@ size_t align_element_n(int16_t **data_in, int num, int height, int width) { } } *data_in = data_tmp; - free(tmp); + fpga_free(tmp); } return num_element * sizeof(int16_t); } +void to_fp16(float* src, + float16* dst, + int num, + int height, + int width, + float* scale_ptr) { + int size = num * height * width; + for (int n = 0; n < num; n++) { + float scale_val = scale_ptr[n]; + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + int index = n * height * width + h * width + w; + float value = src[index] * scale_val; + dst[index] = float_to_half(value); + } + } + } + fpga_flush(dst, size * sizeof(int16_t)); +} + void quantize_to_fp16( - float **data_in, int num, int height, int width, float *scale_ptr) { - float *tmp = *data_in; + float** data_in, int num, int height, int width, float* scale_ptr) { + float* tmp = *data_in; int size = num * height * width; - float16 *tmp_data = (float16 *)fpga_malloc(size * sizeof(float16)); // NOLINT + float16* tmp_data = (float16*)fpga_malloc(size * sizeof(float16)); // NOLINT for (int n = 0; n < num; n++) { float scale_val = scale_ptr[n]; for (int h = 0; h < height; h++) { @@ -298,13 +362,14 @@ void quantize_to_fp16( } } fpga_flush(tmp_data, size * sizeof(int16_t)); - *data_in = (float *)tmp_data; // NOLINT + *data_in = (float*)tmp_data; // NOLINT fpga_free(tmp); } size_t format_dwconv_filter( - float **data_in, int num, int height, int width, float *scale_ptr) { + float** data_in, int num, int height, int width, float* scale_ptr) { quantize_to_fp16(data_in, num, height, width, scale_ptr); - int16_t **quantize_data = (int16_t **)data_in; // NOLINT + int16_t** 
quantize_data = reinterpret_cast(data_in); + convert_to_hwn(quantize_data, num, height, width); size_t size = align_element_n(quantize_data, num, height, width); fpga_flush(*quantize_data, diff --git a/lite/backends/fpga/KD/llapi/filter.h b/lite/backends/fpga/KD/llapi/filter.h index 7d9c6c2e015250cbcba2d1dba71b7c1f3554d9f0..6e056ce0da0d8e731abf7dc418800a8e3d94969a 100644 --- a/lite/backends/fpga/KD/llapi/filter.h +++ b/lite/backends/fpga/KD/llapi/filter.h @@ -18,38 +18,33 @@ limitations under the License. */ #include #include +#include + namespace paddle { namespace zynqmp { namespace filter { void set_filter_capacity(uint32_t cap); +void set_colunm(uint32_t column); +int get_filter_num_alignment(); int calc_division_capacity(int chw); int calc_split_num(int num, int division_capacity); int calc_division_number(int num, int group_num, int division_capacity); int calc_num_per_div(int num, int group_num, int division_capacity); -void convert_to_hwc( - char** data_in, int num, int channel, int height, int width); + float find_max(float* data_in, int data_size); -void quantize(float** data_in, int data_size, float max); -void align_element(char** data_in, int num, int chw); -void align_num(char** data_in, - int num_per_div_before_alignment, - int num, - int chw); -void reorder(char** data_in, int num_after_alignment, int chw); -size_t interleave(char** data_in, int num_after_alignment, int chw); -size_t format_filter(float** data_in, - int num, - int channel, - int height, - int width, - int group_num, - float max); +int8_t* format_filter(float* data_in, + int& mem_size, // NOLINT + int num, + int channel, + int height, + int width, + int group_num, + float max, // NOLINT + std::vector& filter_max); // NOLINT void convert_to_hwn(int16_t** data_in, int num, int height, int width); size_t align_element_n(int16_t** data_in, int num, int height, int width); -void quantize_to_fp16( - float** data_in, int num, int height, int width, float* scale_ptr); size_t format_dwconv_filter( float** data_in, int num, int height, int width, float* scale_ptr); diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp old mode 100644 new mode 100755 index 1f1226ead3d4e9b50100f4de574104a5d6f777b2..06488469d97c077a34b3cfdb8a049c8cd61dfc93 --- a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp +++ b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp @@ -23,13 +23,12 @@ limitations under the License. 
*/ #include #include -#include "lite/backends/fpga/KD/llapi/config.h" #include "lite/backends/fpga/KD/llapi/zynqmp_api.h" namespace paddle { namespace zynqmp { -#define PADDLE_LITE_OS_LINUX +#define PADDLE_OS_LINUX static int fd = -1; static const char *device_path = "/dev/fpgadrv0"; @@ -39,14 +38,10 @@ static size_t memory_size_max = 0; static size_t memory_size = 0; static inline int do_ioctl(uint64_t req, const void *arg) { - int ret = -1; -#ifdef PADDLE_LITE_OS_LINUX - ret = ioctl(fd, req, arg); - if (ret != 0) { - throw - 1; - } +#ifdef PADDLE_OS_LINUX + return ioctl(fd, req, arg); #else - return ret; + return -1; #endif } @@ -66,7 +61,9 @@ void reset_device() { // memory management; void *fpga_malloc(size_t size) { -#ifdef PADDLE_LITE_OS_LINUX +#ifdef ENABLE_DEBUG +#endif +#ifdef PADDLE_OS_LINUX void *ptr = reinterpret_cast( mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)); if (ptr == NULL) { @@ -105,11 +102,8 @@ void fpga_free(void *ptr) { size = iter->second; memory_map.erase(iter); } - memory_size -= size; - -#ifdef PADDLE_LITE_OS_LINUX - +#ifdef PADDLE_OS_LINUX munmap(ptr, size); #else free(ptr); @@ -150,6 +144,11 @@ void fpga_copy(void *dest, const void *src, size_t num) { memcpy(dest, src, num); } +int fpga_reset() { + struct FpgaResetArgs args; + return do_ioctl(IOCTL_FPGA_RESET, &args); +} + int ioctl_conv(const struct ConvArgs &args) { return do_ioctl(IOCTL_CONFIG_CONV, &args); } @@ -166,7 +165,6 @@ int compute_fpga_conv(const struct SplitConvArgs &args) { } if (split_num > 1) { - std::cout << "Split num > 1 !!!!!!!!!!!!!!!!!!" << std::endl; exit(-1); } return ret; @@ -186,6 +184,7 @@ int get_device_info(const struct DeviceInfo &args) { } int perform_bypass(const struct BypassArgs &args) { + int ret = -1; int size = args.image.channels * args.image.width * args.image.height; int max_size = 1 << 21; @@ -213,7 +212,7 @@ int perform_bypass(const struct BypassArgs &args) { reinterpret_cast(input_address + i * max_size * type_size); bypassArgs.output.address = reinterpret_cast(output_address + i * max_size * out_type_size); - int ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs); + ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs); scale = std::max(scale, scales[0]); if (ret != 0) { @@ -222,13 +221,15 @@ int perform_bypass(const struct BypassArgs &args) { } int remainder = size - max_size * count; - bypassArgs.image.channels = remainder; - bypassArgs.image.address = - reinterpret_cast(input_address + count * max_size * type_size); - bypassArgs.output.address = reinterpret_cast( - output_address + count * max_size * out_type_size); - int ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs); - scale = std::max(scale, scales[0]); + if (remainder > 0) { + bypassArgs.image.channels = remainder; + bypassArgs.image.address = + reinterpret_cast(input_address + count * max_size * type_size); + bypassArgs.output.address = reinterpret_cast( + output_address + count * max_size * out_type_size); + ret = do_ioctl(IOCTL_CONFIG_BYPASS, &bypassArgs); + scale = std::max(scale, scales[0]); + } args.output.scale_address[0] = scale; args.output.scale_address[1] = 1.0f / scale; return ret; @@ -237,52 +238,17 @@ int perform_bypass(const struct BypassArgs &args) { int compute_fpga_concat(const struct ConcatArgs &args) { return -1; } int compute_fpga_scale(const struct ScaleArgs &args) { -#ifdef ENABLE_DEBUG - std::cout << "======Compute Scale======"; - std::cout << "scale_address:" << args.scale_address << std::endl; - std::cout << "bias_address:" << args.bias_address << std::endl; - - std::cout 
<< "wc_alignment:" << args.wc_alignment << std::endl; - std::cout << "channel_alignment:" << args.channel_alignment << std::endl; - - std::cout << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - - std::cout << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; - -#endif return do_ioctl(IOCTL_CONFIG_SCALE, &args); } int compute_fpga_dwconv(const struct DWconvArgs &args) { -#ifdef ENABLE_DEBUG - std::cout << "======Compute Basic Conv======"; - std::cout << " relu_enabled:" << args.relu_enabled - << " filter_address:" << args.filter_address; - std::cout << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - std::cout << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - std::cout << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; - -#endif return do_ioctl(IOCTL_CONFIG_DWCONV, &args); } +int config_activation(const struct ActiveParamterArgs &args) { + return do_ioctl(IOCTL_CONFIG_ACTIVATION_PARAMETER, &args); +} + int config_inplace(const struct InplaceArgs &args) { return do_ioctl(IOCTL_CONFIG_INPLACE, &args); } diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.h b/lite/backends/fpga/KD/llapi/zynqmp_api.h old mode 100644 new mode 100755 index 7d22de95a2272862c6fe781295bdaab7177a92fe..9489c24730e52fb778ed341e0ce452b7ef86edf9 --- a/lite/backends/fpga/KD/llapi/zynqmp_api.h +++ b/lite/backends/fpga/KD/llapi/zynqmp_api.h @@ -14,6 +14,9 @@ limitations under the License. 
*/ #pragma once +#ifndef PADDLE_LITE_SRC_FPGA_KD_ZYNQMP_API_H +#define PADDLE_LITE_SRC_FPGA_KD_ZYNQMP_API_H + #include #include #include @@ -40,6 +43,13 @@ enum DLayoutType { LAYOUT_HWC = 0, }; +enum ActiveType { + TYPE_RELU = 0, + TYPE_RELU6 = 1, + TYPE_LEAK_RELU = 2, + TYPE_SIGMOID = 3, +}; + struct VersionArgs { void* buffer; }; @@ -48,7 +58,7 @@ struct DeviceInfo { uint32_t filter_cap; uint32_t version; uint16_t device_type; - uint32_t reserved0; + uint32_t colunm; uint32_t reserved1; uint32_t reserved2; uint32_t reserved3; @@ -108,6 +118,7 @@ struct ConvArgs { void* filter_scale_address; uint32_t filter_num; uint32_t group_num; + uint32_t dilation; struct KernelArgs kernel; struct ImageInputArgs image; // input image; @@ -199,9 +210,16 @@ struct NormalizeParameterArgs { uint32_t hight_width; }; +struct ActiveParamterArgs { + ActiveType type; + uint16_t leaky_relu_factor; +}; + struct InplaceArgs { bool leaky_relu_enable; bool relu_enable; + bool sigmoid_enable; + bool relu6_enable; bool power_enable; bool normalize_enable; }; @@ -216,7 +234,9 @@ struct FpgaRegReadArgs { uint64_t value; }; -struct FpgaResetArgs {}; +struct FpgaResetArgs { + uint32_t val; +}; #define IOCTL_FPGA_MAGIC (('F' + 'P' + 'G' + 'A') / 4) @@ -248,6 +268,8 @@ struct FpgaResetArgs {}; _IOW(IOCTL_FPGA_MAGIC, 41, struct PowerParameterArgs) #define IOCTL_CONFIG_NORMALIZE_PARAMETER \ _IOW(IOCTL_FPGA_MAGIC, 42, struct NormalizeParameterArgs) +#define IOCTL_CONFIG_ACTIVATION_PARAMETER \ + _IOW(IOCTL_FPGA_MAGIC, 43, struct ActiveParamterArgs) #define IOCTL_FPGA_REG_READ _IOW(IOCTL_FPGA_MAGIC, 50, struct FpgaRegReadArgs) #define IOCTL_FPGA_REG_WRITE _IOW(IOCTL_FPGA_MAGIC, 51, struct FpgaRegWriteArgs) #define IOCTL_FPGA_RESET _IOW(IOCTL_FPGA_MAGIC, 52, struct FpgaResetArgs) @@ -331,6 +353,7 @@ int compute_fpga_scale(const struct ScaleArgs& args); int compute_fpga_concat(const struct ConcatArgs& args); int compute_fpga_resize(const struct ResizeArgs& args); +int config_activation(const struct ActiveParamterArgs& args); int config_power(const struct PowerArgs& args); int compute_fpga_dwconv(const struct DWconvArgs& args); int config_norm_param(const struct NormalizeParameterArgs& args); @@ -341,7 +364,11 @@ int config_inplace(const struct InplaceArgs& args); int flush_cache(void* addr, int size); int invalidate_cache(void* addr, int size); +int fpga_reset(); + int16_t fp32_2_fp16(float fp32_num); float fp16_2_fp32(int16_t fp16_num); } // namespace zynqmp } // namespace paddle + +#endif // PADDLE_LITE_SRC_FPGA_KD_ZYNQMP_API_H diff --git a/lite/backends/fpga/KD/pe.hpp b/lite/backends/fpga/KD/pe.hpp index d1dc3c4caa18cbfeba74fac26cca9e19230e2c21..2796124341012574dc719ae9f30633d1d9524680 100644 --- a/lite/backends/fpga/KD/pe.hpp +++ b/lite/backends/fpga/KD/pe.hpp @@ -32,6 +32,5 @@ class PE { virtual ~PE() {} }; - } // namespace zynqmp } // namespace paddle diff --git a/lite/backends/fpga/KD/pe_params.hpp b/lite/backends/fpga/KD/pe_params.hpp index 709f04d399793c6f21c34fc1265f7ed8b5818314..9dc295a58d4bbfd50a0b9ecbdb06a22c8900cef7 100644 --- a/lite/backends/fpga/KD/pe_params.hpp +++ b/lite/backends/fpga/KD/pe_params.hpp @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include "lite/backends/fpga/KD/llapi/zynqmp_api.h" @@ -26,6 +27,7 @@ namespace zynqmp { struct ReLUParam { public: bool enabled = false; + float leaky_relu_factor = 0.0f; }; struct PEParam { @@ -98,6 +100,24 @@ struct DepthwiseConvParam : ConvParam { Tensor* quantizedFilter_ = new Tensor(); }; +struct GRUParam : PEParam { + public: + Tensor* input = nullptr; + Tensor* h0 = nullptr; + Tensor* weight = nullptr; + Tensor* bias = nullptr; + + Tensor* batch_gate = nullptr; + Tensor* batch_reset_hidden_prev = nullptr; + Tensor* batch_hidden = nullptr; + Tensor* hidden = nullptr; + + std::string gate_activation = "sigmoid"; + std::string activation = "tanh"; + bool is_reverse = false; + bool origin_mode = false; +}; + enum PoolingType : int { MAX = 0, AVERAGE = 1, @@ -133,6 +153,12 @@ struct ElementwiseAddParam : PEParam { EWAddArgs ewargs; }; +struct ElementwiseMulParam : PEParam { + public: + std::vector inputs; + Tensor* output = nullptr; +}; + struct FullyConnectedParam : PEParam { public: Tensor* input = nullptr; diff --git a/lite/backends/fpga/KD/pes/conv_pe.hpp b/lite/backends/fpga/KD/pes/conv_pe.hpp index e897f82280fa57f904bd7c749e371d8ec9219b51..fb15eaf77822eed076ec2001bace6871e93587ff 100644 --- a/lite/backends/fpga/KD/pes/conv_pe.hpp +++ b/lite/backends/fpga/KD/pes/conv_pe.hpp @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include #include "lite/backends/fpga/KD/pe.hpp" @@ -49,7 +50,108 @@ class ConvPE : public PE { concatPE_.init(); concatPE_.apply(); } + + if (DLEngine::get_instance().isZU3() && + param_.input->shape().dimSize() == 4 && + param_.input->shape().width() == 1 && + param_.input->shape().width() >= 2048) { + use_cpu_ = true; + } + + if (param_.filter->shape().width() == 1 && + param_.filter->shape().height() == 1) { // NOLINT + } + if (!use_cpu_) { // NOLINT + } } + + void cpu_conv_hwc() { + Tensor* input = param_.input; + Tensor* output = param_.output; + input->syncToCPU(); + + Tensor float_input; + Tensor float_output; + float* image_addr = float_input.mutableData(FP32, input->shape()); + float_input.copyFrom(input); + float_input.syncToCPU(); + float* out = float_output.mutableData(FP32, output->shape()); + + int out_width = output->shape().width(); + int out_channel = output->shape().channel(); + int in_channel = input->shape().channel(); + + float* filter_data = param_.filter->data(); + + int image_height = input->shape().height(); + int image_width = input->shape().width(); + int image_channels = input->shape().channel(); + int image_pad_h = param_.paddings[0]; + int image_pad_w = param_.paddings[1]; + int kernel_height = param_.filter->shape().height(); + int kernel_width = param_.filter->shape().width(); + int kernel_step_h = param_.strides[0]; + int kernel_step_w = param_.strides[1]; + int pooled_height_ = output->shape().height(); + int pooled_width_ = out_width; + int filter_chw = image_channels * kernel_height * kernel_width; + + float max = 0; + + for (int ph = 0; ph < pooled_height_; ph++) { + for (int pw = 0; pw < pooled_width_; pw++) { + int hstart = ph * kernel_step_h - image_pad_h; + int wstart = pw * kernel_step_w - image_pad_w; + int hend = + std::min(hstart + kernel_height, static_cast(image_height)); + int wend = + std::min(wstart + kernel_width, static_cast(image_width)); + hstart = std::max(hstart, static_cast(0)); + wstart = std::max(wstart, static_cast(0)); + for (int oc = 0; oc < out_channel; oc++) { + float sum = 0.0f; + const int pool_index = (ph * 
pooled_width_ + pw) * out_channel + oc; + for (int c = 0; c < image_channels; c++) { + for (int h = hstart; h < hend; h++) { + int hi = 0; + if (ph == 0) { + hi = h - hstart + image_pad_h; + } else { + hi = h - hstart; + } + for (int w = wstart; w < wend; w++) { + int wi = 0; + if (pw == 0) { + wi = w - wstart + image_pad_w; + } else { + wi = w - wstart; + } + const int index = (h * image_width + w) * image_channels + c; + int weight_index = oc * filter_chw + + kernel_width * kernel_height * c + + kernel_width * hi + wi; + float value = image_addr[index] * filter_data[weight_index]; + sum += value; + } + } + } + + if (param_.relu.enabled && sum < 0) { + sum = -sum; + } + if (sum > max) { + max = sum; + } + out[pool_index] = sum; + } + } + } + float_output.flush(); + output->copyFrom(&float_output); + output->scale()[0] = max / 127; + output->scale()[1] = 127 / max; + } + void cpu_compute() { Tensor* input = param_.input; Tensor* output = param_.output; @@ -59,43 +161,78 @@ class ConvPE : public PE { Tensor float_output; float* image_addr = float_input.mutableData(FP32, input->shape()); float_input.copyFrom(input); + float_input.syncToCPU(); + float* out = float_output.mutableData(FP32, output->shape()); + float* bias_data = param_.bias()->data(); + + int out_width = output->shape().width(); int out_channel = output->shape().channel(); int in_channel = input->shape().channel(); float* filter_data = param_.filter->data(); float* mi = new float[in_channel]; + float max = 0; + int out_index = 0; for (int i = 0; i < out_channel; i++) { float* image = image_addr; float* filter_ptr = filter_data + i * in_channel; float* out_ptr = mi; -#pragma omp parallel for - for (int j = 0; j < in_channel; j++) { - float value = image_addr[j] * filter_ptr[j]; - mi[j] = value; - } - float sum = 0; - for (int j = 0; j < in_channel; j++) { - sum += mi[j]; + for (int h = 0; h < output->shape().height(); h++) { + for (int w = 0; w < output->shape().width(); w++) { + float sum = 0; + + // #pragma omp parallel for + for (int j = 0; j < in_channel; j++) { + int image_index = h * out_width * in_channel + w * in_channel + j; + float value = image_addr[image_index] * filter_ptr[j]; + sum += value; + } + + sum += bias_data[i]; + + if (param_.relu.enabled && sum < 0) { + sum = 0; + } + if (sum > max) { + max = sum; + } + out_index = h * out_width * out_channel + w * out_channel + i; + out[out_index] = sum; + } } - out[i] = sum; } delete[] mi; float_output.flush(); output->copyFrom(&float_output); + output->scale()[0] = max / 127; + output->scale()[1] = 127 / max; } bool dispatch() { - inplace_.relu_enable = param_.relu.enabled; + if (use_cpu_) { + cpu_compute(); + return true; + } + + inplace_.leaky_relu_enable = + (param_.relu.leaky_relu_factor != 0) ? true : false; + inplace_.relu_enable = + inplace_.leaky_relu_enable ? 
false : param_.relu.enabled; + inplace_.power_enable = false; inplace_.normalize_enable = false; - - if (param_.relu.enabled) { - inplace_.relu_enable = param_.relu.enabled; + if (inplace_.relu_enable || inplace_.leaky_relu_enable) { config_inplace(inplace_); + if (inplace_.leaky_relu_enable) { + activeParamterArgs.type = TYPE_LEAK_RELU; + activeParamterArgs.leaky_relu_factor = + fp32_2_fp16(param_.relu.leaky_relu_factor); + config_activation(activeParamterArgs); + } } std::vector& params = param_.splitParams(); @@ -104,9 +241,16 @@ class ConvPE : public PE { ret |= compute_fpga_conv_basic(conv_param->args); } - if (param_.relu.enabled) { + if (inplace_.relu_enable || inplace_.leaky_relu_enable) { inplace_.relu_enable = false; + inplace_.leaky_relu_enable = false; config_inplace(inplace_); + + if (inplace_.leaky_relu_enable) { + activeParamterArgs.type = TYPE_LEAK_RELU; + activeParamterArgs.leaky_relu_factor = fp32_2_fp16(0); + config_activation(activeParamterArgs); + } } size_t size = params.size(); @@ -127,11 +271,13 @@ class ConvPE : public PE { ConvParam& param() { return param_; } private: + bool use_cpu_ = false; ConvParam param_; ConcatPE concatPE_; ElementwiseAddPE addPE_; int split_axis = 0; InplaceArgs inplace_ = {0}; + ActiveParamterArgs activeParamterArgs; }; } // namespace zynqmp diff --git a/lite/backends/fpga/KD/pes/conv_process.hpp b/lite/backends/fpga/KD/pes/conv_process.hpp old mode 100644 new mode 100755 index 23332b422df65250f8cadf07f5e0d95e970d316a..ecee45569c8df3d3e3926b2ca78cb49da8415aa4 --- a/lite/backends/fpga/KD/pes/conv_process.hpp +++ b/lite/backends/fpga/KD/pes/conv_process.hpp @@ -14,6 +14,9 @@ limitations under the License. */ #pragma once +#ifndef conv_process_hpp +#define conv_process_hpp + #include #include #include @@ -45,7 +48,9 @@ inline int get_split_num(Tensor* filter) { filter->shape().width(); auto num = filter->shape().num(); int div_capacity = filter::calc_division_capacity(chw); - return filter::calc_split_num(num, div_capacity); + int filter_num_alignment = filter::get_filter_num_alignment(); + int aligned_num = align_to_x(num, filter_num_alignment); + return filter::calc_split_num(aligned_num, div_capacity); } inline void fill_scale_bias_const(ConvParam* param_) { @@ -126,41 +131,85 @@ inline void format_scale_bias(Tensor* scale, bias_data = bias->data(); } int channel = filter->shape().num(); - Shape bias_scale_shape(N, {2 * channel}); + int scale_bias_len = align_to_x(channel / group, BS_NUM_ALIGNMENT) * group; + + int c_per_group = channel / group; + int aligned_c_per_group = align_to_x(channel / group, BS_NUM_ALIGNMENT); + + Shape bias_scale_shape(N, {2 * scale_bias_len}); float* bs_data = scale_bias->mutableData(FP32, bias_scale_shape); - for (int i = 0; i < channel; i++) { - float scale_value = scale_data == nullptr ? 1 : scale_data[i]; - float bias_value = bias_data == nullptr ? 
0 : bias_data[i]; - bs_data[i + channel] = scale_value; - bs_data[i] = bias_value; + float* temp_data = + reinterpret_cast(fpga_malloc(2 * scale_bias_len * sizeof(float))); + memset(temp_data, 0, 2 * scale_bias_len * sizeof(float)); + + std::vector scales; + if (scale_data != nullptr) { + for (int i = 0; i < channel; ++i) { + scales.push_back(scale_data[i]); + } + for (int i = 0; i < scale_bias_len - channel; i++) { + scales.push_back(1); + } + } else { + for (int i = 0; i < scale_bias_len; i++) { + scales.push_back(1); + } } - int element_num_per_div = get_filter_num_per_div(filter, group); - bias_scale::format_bias_scale_array(&bs_data, element_num_per_div, channel); + for (int i = 0; i < scale_bias_len; ++i) { + temp_data[i + scale_bias_len] = 1; + temp_data[i] = 0; + } + + for (int g = 0; g < group; g++) { + for (int c = 0; c < c_per_group; c++) { + int src_index = g * c_per_group + c; + int dst_index = g * aligned_c_per_group + c; + float scale_value = scales[src_index]; + float bias_value = bias_data == nullptr ? 0 : bias_data[src_index]; + temp_data[dst_index + scale_bias_len] = scale_value; + temp_data[dst_index] = bias_value; + } + } + + bias_scale::format_bias_scale_array( + &temp_data, scale_bias_len / group, scale_bias_len); + memcpy(bs_data, temp_data, 2 * scale_bias_len * sizeof(float)); } -inline void format_filter(Tensor* filter, Tensor* quantized_filter, int group) { +inline void format_filter(Tensor* filter, + Tensor* quantized_filter, + int group, + std::vector& scales) { // NOLINT float max_value = find_max(*filter); Shape& filter_shape = filter->shape(); + + int mem_size; + std::vector max_values; + int8_t* quantized_data = filter::format_filter(filter->data(), + mem_size, + filter_shape.num(), + filter_shape.channel(), + filter_shape.height(), + filter_shape.width(), + group, + max_value, + max_values); + + float mem_factor = mem_size * 1.0f / filter->shape().numel(); + quantized_filter->setMemScale(mem_factor); + quantized_filter->setAligned(true); - quantized_filter->mutableData(INT8, filter->shape()); + int8_t* src = quantized_filter->mutableData(INT8, filter->shape()); quantized_filter->scale()[0] = max_value / 127.0f; quantized_filter->scale()[1] = 127.0f / max_value; - auto memory_size = filter->shape().memorySize(sizeof(float)); - auto new_data = reinterpret_cast(fpga_malloc(memory_size)); - memcpy(new_data, filter->data(), memory_size); - size_t mem_size = filter::format_filter(&new_data, - filter_shape.num(), - filter_shape.channel(), - filter_shape.height(), - filter_shape.width(), - group, - max_value); - int8_t* src = quantized_filter->mutableData(INT8, filter->shape()); - memcpy(src, new_data, mem_size); - fpga_free(new_data); + memcpy(src, quantized_data, mem_size); quantized_filter->flush(); + + for (size_t i = 0; i < max_values.size(); i++) { + scales.push_back(max_values[i] / max_value); + } } inline void format_dw_filter(Tensor* filter, @@ -207,10 +256,18 @@ inline void split_filter_num(const ConvParam& c_param) { Tensor* out = param.output; Tensor* filter = param.filter; auto channel = out->shape().channel(); - int split_num = param.groups == 1 ? 
get_split_num(param.filter) : 1; int filter_num_per_div = get_filter_num_per_div(filter, param.groups); + auto chw = filter->shape().channel() * filter->shape().height() * + filter->shape().width(); + auto num = filter->shape().num(); + int div_capacity = filter::calc_division_capacity(chw); + int filter_num_alignment = filter::get_filter_num_alignment(); + int aligned_num = + align_to_x(num / param.groups, filter_num_alignment) * param.groups; + split_num = filter::calc_split_num(aligned_num, div_capacity); + Shape& out_shape = out->shape(); for (int i = 0; i < split_num; i++) { BasicConvParam* conv_param = new BasicConvParam(); @@ -251,9 +308,17 @@ inline void split_filter_num(const ConvParam& c_param) { filter->data() + i * filter_num_per_div * filter_hwc, filter_num * filter_hwc * sizeof(float)); new_filter.flush(); - conv_param->filter.mutableData(FP32, f_shape); - format_filter(&new_filter, &(conv_param->filter), param.groups); + + if (param.groups != 1) { + int mem_factor = + 32 / filter_num_per_div; // TODO(chonwhite): change 32 to param; + conv_param->filter.setMemScale(mem_factor); + } + + std::vector v; // TODO(chonwhite): change local variable name + format_filter(&new_filter, &(conv_param->filter), param.groups, v); + conv_param->filter.setDataType(INT8); int sb_num = 2 * align_to_x(filter_num, BS_NUM_ALIGNMENT); Tensor scale; @@ -265,7 +330,7 @@ inline void split_filter_num(const ConvParam& c_param) { float* scale_data = scale.mutableData(FP32, s_shape); float* bias_data = bias.mutableData(FP32, s_shape); for (int n = 0; n < filter_num; n++) { - scale_data[n] = param.scale()->data()[n + chnnnel_start]; + scale_data[n] = param.scale()->data()[n + chnnnel_start] * v[n]; } for (int n = 0; n < filter_num; n++) { bias_data[n] = param.bias()->data()[n + chnnnel_start]; @@ -276,11 +341,14 @@ inline void split_filter_num(const ConvParam& c_param) { &conv_param->filter, &conv_param->scaleBias, param.groups); + conv_param->scaleBias.flush(); + float* bs_data = conv_param->scaleBias.data(); args.group_num = param.groups; args.relu_enabled = param.relu.enabled; args.sb_address = conv_param->scaleBias.data(); + args.sb_address = bs_data; args.kernel.stride_h = param.strides[1]; args.kernel.stride_w = param.strides[0]; args.kernel.height = new_filter.shape().height(); @@ -294,17 +362,12 @@ inline void split_filter_num(const ConvParam& c_param) { args.image.channels = input->shape().channel(); args.image.width = input->shape().width(); args.image.height = input->shape().height(); - auto paddings = *param.padding; - args.image.pad_width = param.paddings[2]; + args.image.pad_width = param.paddings[1]; args.image.pad_height = param.paddings[0]; + args.dilation = param.dilations[0]; + args.output.address = out_address; args.output.scale_address = out_scale_address; - bool pad_equal = - ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); - if (!pad_equal) { - LOG(FATA) << "This pad not support ! 
" << paddings[0] << ", " - << paddings[1] << ", " << paddings[2] << ", " << paddings[3]; - } param.splitParams().push_back(conv_param); } } @@ -317,7 +380,7 @@ inline void split_channel(const ConvParam& c_param) { int num = ceil(input->shape().channel() * 1.0f / 2047); int channel = input->shape().channel() / num; - std::cout << "channel::" << channel << "num::" << num << std::endl; + Shape bs_shape(N, {channel}); for (int i = 0; i < num; i++) { @@ -331,6 +394,7 @@ inline void split_channel(const ConvParam& c_param) { // filter transformation; Shape f_shape(NCHW, {param.filter->shape().num(), channel, 1, 1}); + Tensor new_filter; float* dst = new_filter.mutableData(FP32, f_shape); @@ -341,7 +405,8 @@ inline void split_channel(const ConvParam& c_param) { src += param.filter->shape().channel(); } new_filter.flush(); - format_filter(&new_filter, &(conv_param->filter), param.groups); + std::vector scales; + format_filter(&new_filter, &(conv_param->filter), param.groups, scales); Tensor bias; Tensor scale; @@ -379,18 +444,11 @@ inline void split_channel(const ConvParam& c_param) { args.image.channels = conv_param->input.shape().channel(); args.image.width = conv_param->input.shape().width(); args.image.height = conv_param->input.shape().height(); - auto paddings = *param.paddings; - args.image.pad_width = paddings[2]; - args.image.pad_height = paddings[0]; - + args.image.pad_width = param.paddings[1]; + args.image.pad_height = param.paddings[0]; + args.dilation = param.dilations[0]; args.output.address = conv_param->output.mutableData(); args.output.scale_address = conv_param->output.scale(); - bool pad_equal = - ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); - if (!pad_equal) { - LOG(FATA) << "This pad not support ! " << paddings[0] << ", " - << paddings[1] << ", " << paddings[2] << ", " << paddings[3]; - } param.splitParams().push_back(conv_param); } } @@ -422,7 +480,6 @@ inline bool compute_conv(const ConvParam& c_conv_params) { for (int i = 0; i < 1; i++) { for (int i = 0; i < img.shape().numel(); i++) { float value = half_to_float(img.data()[i]); - std::cout << "value:" << value << std::endl; } } } @@ -431,3 +488,5 @@ inline bool compute_conv(const ConvParam& c_conv_params) { } // namespace zynqmp } // namespace paddle + +#endif /* conv_process_hpp */ diff --git a/lite/backends/fpga/KD/pes/crop_pe.cpp b/lite/backends/fpga/KD/pes/crop_pe.cpp old mode 100644 new mode 100755 index c29df623aa610d329a46ee337cdcb1abd801881c..1438aaba6565cefa72f863d5fc3af0a389fc95e0 --- a/lite/backends/fpga/KD/pes/crop_pe.cpp +++ b/lite/backends/fpga/KD/pes/crop_pe.cpp @@ -14,8 +14,6 @@ limitations under the License. */ #include "lite/backends/fpga/KD/pes/crop_pe.hpp" -#include - namespace paddle { namespace zynqmp { diff --git a/lite/backends/fpga/KD/pes/crop_pe.hpp b/lite/backends/fpga/KD/pes/crop_pe.hpp index 6ebbcdb31f1afb7939c75a2ba9254c0b31f67d31..ccd1e0c98968375ebd840c7e8b15aedd6ad7ef77 100755 --- a/lite/backends/fpga/KD/pes/crop_pe.hpp +++ b/lite/backends/fpga/KD/pes/crop_pe.hpp @@ -14,6 +14,7 @@ limitations under the License. 
 */
 
 #pragma once
+#include <algorithm>
 #include <string>
 #include <vector>
 
diff --git a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp
old mode 100644
new mode 100755
index f86806102d4a217ae4bb7355b36ca10d96ca4a05..0efca2ec2e60e8973d92f41463b0444722f2a73b
--- a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp
+++ b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp
@@ -37,18 +37,36 @@ class DepthwiseConvPE : public PE {
     Tensor* output = param.output;
     int channel = output->shape().channel();
 
-    float* new_scale_data = param_.scale()->data<float>();
-    float* new_bias_data = param_.bias()->data<float>();
-
     float16* b_data = bias_.mutableData<float16>(FP16, param_.bias()->shape());
-    for (int i = 0; i < channel; i++) {
-      b_data[i] = float_to_half(new_bias_data[i]);
+    if (param_.bias()->dataType() == FP32) {
+      float* new_bias_data = param_.bias()->data<float>();
+      // convert the bias from float to float16
+      for (int i = 0; i < channel; i++) {
+        b_data[i] = float_to_half(new_bias_data[i]);
+      }
+      bias_.flush();
+    } else {
+      float16* new_bias_data = param_.bias()->data<float16>();
+      memcpy(b_data, new_bias_data, channel * sizeof(float16));
+      bias_.flush();
     }
-    bias_.flush();
 
-    Tensor* quantized_filter = param.quantizedFilter();
-    quantized_filter->mutableData<float16>(FP16, param.filter->shape());
-    format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data);
+    if (param_.scale()->dataType() == FP32) {
+      float* new_scale_data = param_.scale()->data<float>();
+      Tensor* quantized_filter = param.quantizedFilter();
+      quantized_filter->mutableData<float16>(FP16, param.filter->shape());
+      format_dw_filter(param.filter, param.quantizedFilter(), new_scale_data);
+
+    } else {
+      // used when the filter is all ones and the channel count is already
+      // aligned
+      float16* scale_data = param_.scale()->data<float16>();
+      float16* filter_data = param.quantizedFilter()->mutableData<float16>(
+          FP16, param.filter->shape());
+      memcpy(filter_data,
+             scale_data,
+             param.filter->shape().numel() * sizeof(float16));
+      param.quantizedFilter()->flush();
+    }
 
     DWconvArgs args = {0};
     args.bias_address = b_data;
@@ -61,21 +79,14 @@ class DepthwiseConvPE : public PE {
     args.image.channels = input->shape().channel();
     args.image.height = input->shape().height();
     args.image.width = input->shape().width();
-    auto paddings = *param.paddings;
-    args.image.pad_width = param.paddings[2];
-    args.image.pad_height = param.paddings[0];
+    args.image.pad_width = param.paddings[0];
+    args.image.pad_height = param.paddings[1];
     args.image.scale_address = input->scale();
     args.output.address = output->data<void>();
     args.output.scale_address = output->scale();
     args.out_width = param.output->shape().width();
     args.out_height = param.output->shape().height();
     args.sub_conv_num = 1;
-    bool pad_equal =
-        ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
-    if (!pad_equal) {
-      LOG(FATAL) << "This pad is not supported: " << paddings[0] << ", "
-                 << paddings[1] << ", " << paddings[2] << ", " << paddings[3];
-    }
     param.args = args;
 
     inplace_.relu_enable = param_.relu.enabled;
diff --git a/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp b/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..0505e78b61e3b0130c876880894cec29c78406f2
--- /dev/null
+++ b/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp
@@ -0,0 +1,77 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "lite/backends/fpga/KD/pe.hpp"
+#include "lite/backends/fpga/KD/pe_params.hpp"
+namespace paddle {
+namespace zynqmp {
+
+class ElementwiseMulPE : public PE {
+ public:
+  bool init() {
+    Tensor* output = param_.output;
+    output->setAligned(true);
+    output->setDataLocation(Device);
+    return true;
+  }
+
+  void apply() {
+    Tensor* input = param_.inputs[0];
+    Tensor* output = param_.output;
+
+    int wc_aligned = align_to_x(param_.inputs[0]->shape().numel(), 32);
+
+    // The elementwise multiply is mapped onto the FPGA scale op
+    // (y = x * scale + bias): the second input acts as the scale and
+    // the bias tensor is fixed to zero.
+    Shape s(N, {wc_aligned});
+    float16* bias_data = bias_tensor.mutableData<float16>(FP16, s);
+    memset(bias_data, 0, wc_aligned * sizeof(float16));
+
+    ScaleArgs& args = args_;
+    args.scale_address = param_.inputs[1]->data<void>();
+    args.bias_address = bias_tensor.data<void>();
+    args.wc_alignment = wc_aligned;
+    args.channel_alignment = wc_aligned;
+    args.image.address = input->data<void>();
+    args.image.scale_address = input->scale();
+    args.image.channels = wc_aligned;
+    args.image.height = 1;
+    args.image.width = 1;
+    args.image.pad_width = 0;
+    args.image.pad_height = 0;
+    args.output.address = output->data<void>();
+    args.output.scale_address = output->scale();
+  }
+
+  void updateInput(Tensor* t, int index) {
+    if (index == 0) {
+      args_.scale_address = t->data<void>();  // replace inputs?
+    }
+  }
+
+  bool dispatch() { return compute_fpga_scale(args_) == 0; }
+
+  ElementwiseMulParam& param() { return param_; }
+
+ private:
+  ElementwiseMulParam param_;
+  ScaleArgs args_ = {0};
+  Tensor bias_tensor;
+};
+
+}  // namespace zynqmp
+}  // namespace paddle
diff --git a/lite/backends/fpga/KD/pes/fully_connected_pe.hpp b/lite/backends/fpga/KD/pes/fully_connected_pe.hpp
old mode 100644
new mode 100755
index 2179a142ad3b3a990512b3ea1cd202bc5ce502f1..db3e05276171607da4cea421dd554846a00314a6
--- a/lite/backends/fpga/KD/pes/fully_connected_pe.hpp
+++ b/lite/backends/fpga/KD/pes/fully_connected_pe.hpp
@@ -37,7 +37,10 @@ class FullyConnectedPE : public PE {
     ConvParam& convParam_ = convPE_.param();
     Tensor* input = param_.input;
     convParam_.input = param_.input;
+    num_ = param_.input->shape().num();
+
     convParam_.output = param_.output;
+
     convParam_.groups = 1;
     convParam_.strides = {1, 1};
     convParam_.paddings = {0, 0};
@@ -63,7 +66,6 @@ class FullyConnectedPE : public PE {
         new_filter_data[i * chw + j] = scale;
       }
     }
-
     conv_filter->flush();
 
     convParam_.filter = conv_filter;
@@ -89,6 +91,8 @@ class FullyConnectedPE : public PE {
  private:
   FullyConnectedParam param_;
   ConvPE convPE_;
+  Tensor tempOut_;
+  int num_ = 1;
 };
 }  // namespace zynqmp
 }  // namespace paddle
diff --git a/lite/backends/fpga/KD/pes/gru_pe.hpp b/lite/backends/fpga/KD/pes/gru_pe.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..dcacab4eeef32b245d4126b72597b398a6627ba6
--- /dev/null
+++ b/lite/backends/fpga/KD/pes/gru_pe.hpp
@@ -0,0 +1,191 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "lite/backends/arm/math/sgemm.h" +#include "lite/backends/fpga/KD/pe.hpp" +#include "lite/backends/fpga/KD/pe_params.hpp" +#include "lite/backends/fpga/KD/pes/elementwise_add_pe.hpp" +#include "lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp" +#include "lite/backends/fpga/KD/pes/fully_connected_pe.hpp" +#include "lite/backends/fpga/KD/pes/relu_pe.hpp" + +#include "lite/api/paddle_place.h" +#include "lite/backends/arm/math/funcs.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace zynqmp { + +struct GRUTensors { + Tensor* gate; + Tensor* pre_output; + Tensor* output; + Tensor* reset_output; +}; + +class GRUPE : public PE { + public: + bool init() { return true; } + + void apply() { + auto hidden = param_.hidden; + int frame_size = hidden->shape().channel(); + + zynqmp::Shape hidden_shape{zynqmp::NCHW, {1, frame_size, 1, 1}}; + float16* prev_hidden_data = + prev_hidden_.mutableData(zynqmp::FP16, hidden_shape); + memset(prev_hidden_data, 0, hidden_shape.numel() * sizeof(float16)); + + zynqmp::Shape weight_shape{zynqmp::NC, {frame_size, frame_size * 2}}; + float* weight_data = weight_.mutableData(zynqmp::FP32, weight_shape); + memset(weight_data, 0, weight_shape.numel() * sizeof(float)); + weight_data = weight_.mutableData(zynqmp::FP32, weight_shape); + memcpy(weight_data, + param_.weight->data(), + weight_shape.numel() * sizeof(float)); + + Shape gate_shape(zynqmp::NC, {1, frame_size * 2}); + gate_ping_.mutableData(FP32, gate_shape); + gate_pong_.mutableData(FP16, gate_shape); + + zynqmp::FullyConnectedParam& pre_out_param = pre_out_pe_.param(); + pre_out_param.input = &prev_hidden_; + pre_out_param.output = &gate_pong_; + pre_out_param.filter = &weight_; + pre_out_param.bias = &gate_ping_; + pre_out_pe_.init(); + pre_out_pe_.apply(); + + reset_gate_.mutableData(FP16, hidden_shape); + prev_hidden_.mutableData(FP16, hidden_shape); + reset_hidden_.mutableData(FP16, hidden_shape); + + ElementwiseMulParam& mul_param = mul_pe_.param(); + mul_param.inputs = {&reset_gate_, &prev_hidden_}; + mul_param.output = &reset_hidden_; + mul_pe_.init(); + mul_pe_.apply(); + } + + bool dispatch() { return true; } + + void gru_unit_reset_act(const lite_api::ActivationType active_gate, + GRUTensors& value, // NOLINT + int frame_size, + int batch_size) { + int stride_update = 3 * frame_size; + int stride_cell_state = 3 * frame_size; + int stride_hidden_prev = frame_size; + int stride_hidden = frame_size; + + float* update_gate_data = gate_ping_.data(); + float* reset_gate_data = update_gate_data + frame_size; + + for (int b = 0; b < batch_size; b++) { + Tensor tmp; + Shape s(NC, {1, frame_size}); + float* tmp_data = tmp.mutableData(FP32, s); + + for (int i = 0; i < frame_size; i++) { + update_gate_data[i] = + lite::arm::math::active_f32( + update_gate_data[i]); + reset_gate_data[i] = + lite::arm::math::active_f32( + reset_gate_data[i]); + } + memcpy(tmp_data, reset_gate_data, frame_size * sizeof(float)); + tmp.flush(); + reset_gate_.copyFrom(&tmp); + + Tensor* hidden_prev = value.pre_output; + if (hidden_prev) { + // TODO(chonwhite): change to pre_out; + 
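+          // For orientation, a sketch of the GRU step these helpers build up
+          // (the actual activations come from active_gate/active_node rather
+          // than being fixed to sigmoid/tanh):
+          //   u_t = act(x_u + U_u * h_{t-1})   update gate
+          //   r_t = act(x_r + U_r * h_{t-1})   reset gate
+          //   reset_hidden = r_t .* h_{t-1}    (computed by mul_pe_ below)
+          //   h~_t = act(x_c + U_c * reset_hidden)
+          //   h_t  = (1 - u_t) .* h_{t-1} + u_t .* h~_t
+          // gru_unit_reset_act covers the first three lines; gru_unit_out_act
+          // is left to complete the remaining two.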
prev_hidden_.copyFrom(value.pre_output); + prev_hidden_.saveToFile("prev_.txt"); + } + + mul_pe_.dispatch(); + reset_hidden_.saveToFile("reset_hidden_.txt"); + update_gate_data += stride_update; + reset_gate_data += stride_update; + + // reset_hidden_prev += stride_hidden;// TODO + } + } + + void gru_unit_out_act(const lite_api::ActivationType active_node, + bool origin_mode, + GRUTensors& value, // NOLINT + int frame_size, + int batch_size) {} + + void copy_input(GRUTensors& value) { // NOLINT + float max = find_max(*(value.gate)); + gate_ping_.mutableData(FP32, value.gate->shape()); + gate_ping_.copyFrom(value.gate); + // update input pointer? + } + + void GRUCOmpute(GRUTensors& value, // NOLINT + int frame_size, + int batch_size, + const lite_api::ActivationType active_node, + const lite_api::ActivationType active_gate, + bool origin_mode) { + copy_input(value); + + if (value.pre_output) { + // copy by batch; + pre_out_pe_.dispatch(); + gate_ping_.copyFrom(&gate_pong_); + } + + gru_unit_reset_act(active_gate, value, frame_size, batch_size); + } + + GRUParam& param() { return param_; } + + Tensor* updateGate() { return &update_gate_; } + + Tensor* resetGate() { return &reset_gate_; } + + private: + GRUParam param_; + zynqmp::Tensor gate_ping_; + zynqmp::Tensor gate_pong_; + zynqmp::Tensor bias_; + zynqmp::Tensor weight_; + zynqmp::Tensor state_weight_; + zynqmp::Tensor update_gate_; + zynqmp::Tensor reset_gate_; + zynqmp::Tensor cell_state_; + zynqmp::Tensor prev_hidden_; + zynqmp::Tensor reset_hidden_; + + Tensor tempTensor; + + ReluPE update_relu_pe_; + ReluPE reset_relu_pe_; + zynqmp::ElementwiseMulPE mul_pe_; + zynqmp::FullyConnectedPE pre_out_pe_; + zynqmp::FullyConnectedPE reset_out_pe_; + + zynqmp::ElementwiseAddPE bias_ew_pe_; +}; + +} // namespace zynqmp +} // namespace paddle diff --git a/lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h b/lite/backends/fpga/KD/pes/gru_util.hpp similarity index 71% rename from lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h rename to lite/backends/fpga/KD/pes/gru_util.hpp index 3c76e0e8b5cf0842cb8d5a613cef7aee3cd13bdb..d49169846f4f18e4d8e30f3658c2173157678f81 100644 --- a/lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h +++ b/lite/backends/fpga/KD/pes/gru_util.hpp @@ -14,13 +14,10 @@ #pragma once -#include "lite/kernels/xpu/bridges/registry.h" +#include "lite/backends/arm/math/gru_utils.h" -USE_XPU_BRIDGE(relu); -USE_XPU_BRIDGE(conv2d); -USE_XPU_BRIDGE(depthwise_conv2d); -USE_XPU_BRIDGE(elementwise_add); -USE_XPU_BRIDGE(pool2d); -USE_XPU_BRIDGE(softmax); -USE_XPU_BRIDGE(mul); -USE_XPU_BRIDGE(batch_norm); +namespace paddle { +namespace lite { +namespace fpga {} +} +} diff --git a/lite/backends/fpga/KD/pes/output_pe.hpp b/lite/backends/fpga/KD/pes/output_pe.hpp old mode 100644 new mode 100755 index 1c99386ab19f485c07723c7fcc8501bdf5556f6c..2944691693b135a2d2df7b91ecbe0ef249b015d8 --- a/lite/backends/fpga/KD/pes/output_pe.hpp +++ b/lite/backends/fpga/KD/pes/output_pe.hpp @@ -25,6 +25,8 @@ class OutputPE : public PE { bool init() { Tensor* output = param_.output; output->setAligned(false); + DLEngine::get_instance().out_data = reinterpret_cast( + fpga_malloc(output->shape().numel() * sizeof(float))); return true; } @@ -41,6 +43,15 @@ class OutputPE : public PE { } else { output->copyFrom(input); } + // + output->syncToCPU(); + if (DLEngine::get_instance().out_data == nullptr) { + DLEngine::get_instance().out_data = reinterpret_cast( + fpga_malloc(output->shape().numel() * sizeof(float))); + } + memcpy(DLEngine::get_instance().out_data, + 
output->data(), + output->shape().numel() * sizeof(float)); return true; } diff --git a/lite/backends/fpga/KD/pes/pooling_pe.hpp b/lite/backends/fpga/KD/pes/pooling_pe.hpp old mode 100644 new mode 100755 index 5bb4f5285a48c7696b1f0f78a9b1c4fe6a9d76c5..a8725b51a690e0e134785fcfdb3dd70edeffd441 --- a/lite/backends/fpga/KD/pes/pooling_pe.hpp +++ b/lite/backends/fpga/KD/pes/pooling_pe.hpp @@ -35,24 +35,25 @@ class PoolingPE : public PE { Tensor* input = param_.input; Tensor* output = param_.output; - uint32_t k_width = param_.kernelSize[0]; - uint32_t k_height = param_.kernelSize[1]; + uint32_t k_height = param_.kernelSize[0]; + uint32_t k_width = param_.kernelSize[1]; if (param_.globalPooling) { k_width = input->shape().width(); k_height = input->shape().height(); + param_.kernelSize[0] = k_height; + param_.kernelSize[1] = k_width; } PoolingArgs args = {0}; args.mode = param_.type; - auto paddings = *param_.paddings; args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height)); args.image.address = input->data(); args.image.channels = input->shape().channel(); args.image.height = input->shape().height(); args.image.width = input->shape().width(); - args.image.pad_height = paddings[0]; - args.image.pad_width = paddings[2]; + args.image.pad_height = param_.paddings[0]; + args.image.pad_width = param_.paddings[1]; args.image.scale_address = input->scale(); args.output.address = output->mutableData(); args.output.scale_address = output->scale(); @@ -66,6 +67,7 @@ class PoolingPE : public PE { use_cpu_ = output->shape().width() == 1 && output->shape().height() == 1 && (k_width > 7 || k_height > 7); + use_cpu_ = param_.type == AVERAGE; } void compute() { @@ -77,13 +79,12 @@ class PoolingPE : public PE { float* image_addr = float_input.mutableData(FP32, input->shape()); float_input.copyFrom(input); float16* data_out = output->data(); - auto paddings = *param_.paddings; int image_height = input->shape().height(); int image_width = input->shape().width(); int image_channels = input->shape().channel(); - int image_pad_h = paddings[0]; - int image_pad_w = paddings[2]; + int image_pad_h = param_.paddings[0]; + int image_pad_w = param_.paddings[1]; int kernel_height = param_.kernelSize[1]; int kernel_width = param_.kernelSize[0]; int kernel_step_h = param_.strides[0]; @@ -129,7 +130,7 @@ class PoolingPE : public PE { output->flush(); } - void cpu_compute() { + void cpu_compute1() { Tensor* input = param_.input; Tensor* output = param_.output; input->syncToCPU(); @@ -138,7 +139,6 @@ class PoolingPE : public PE { float_input.mutableData(FP32, input->shape()); float_input.copyFrom(input); float16* data_out = output->data(); - int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1]; float scale_max = 0; @@ -154,7 +154,35 @@ class PoolingPE : public PE { } output->scale()[0] = scale_max / 127.0f; output->scale()[1] = 127.0f / scale_max; - std::cout << "pool scale:" << scale_max / 127.0f << std::endl; + output->flush(); + } + + void cpu_compute() { + Tensor* input = param_.input; + Tensor* output = param_.output; + input->syncToCPU(); + + Tensor float_input; + float* float_input_data = + float_input.mutableData(FP32, input->shape()); + float_input.copyFrom(input); + + float16* data_out = output->data(); + + int kernel_hw = param_.kernelSize[0] * param_.kernelSize[1]; + + float scale_max = 0; + for (int i = 0; i < output->shape().channel(); i++) { + float sum = 0; + for (int j = 0; j < kernel_hw; j++) { + sum += float_input_data[i * kernel_hw + j]; + } + float value = sum / kernel_hw; + data_out[i] = 
float_to_half(value);
+      scale_max = std::max(scale_max, std::abs(value));
+    }
+    output->scale()[0] = scale_max / 127.0f;
+    output->scale()[1] = 127.0f / scale_max;
     output->flush();
   }
 
diff --git a/lite/backends/fpga/KD/pes/prior_box_pe.cpp b/lite/backends/fpga/KD/pes/prior_box_pe.cpp
index d6a503a31d4e0736724740ce1875c916969d93e0..00dfe1830f6f44cbf6a30708fa5783563470c686 100644
--- a/lite/backends/fpga/KD/pes/prior_box_pe.cpp
+++ b/lite/backends/fpga/KD/pes/prior_box_pe.cpp
@@ -253,9 +253,8 @@ bool PriorBoxPE::dispatch() {
   if (cachedBoxes_ == nullptr) {
     cachedBoxes_ = new Tensor();
     cachedVariances_ = new Tensor();
-    cachedBoxes_->mutableData<float16>(FP16, param_.outputBoxes->shape());
-    cachedVariances_->mutableData<float16>(FP16,
-                                           param_.outputVariances->shape());
+    cachedBoxes_->mutableData<float>(FP32, param_.outputBoxes->shape());
+    cachedVariances_->mutableData<float>(FP32, param_.outputVariances->shape());
     cachedBoxes_->setDataLocation(CPU);
     cachedVariances_->setDataLocation(CPU);
     compute_prior_box();
diff --git a/lite/backends/fpga/KD/pes/scale_pe.hpp b/lite/backends/fpga/KD/pes/scale_pe.hpp
old mode 100755
new mode 100644
index d5e16615d9943a1771dfabe916433768ecf16319..cc89ac943f90cb20062a3d6ef9a46b705193ad04
--- a/lite/backends/fpga/KD/pes/scale_pe.hpp
+++ b/lite/backends/fpga/KD/pes/scale_pe.hpp
@@ -14,11 +14,16 @@ limitations under the License. */
 
 #pragma once
 
+#include <algorithm>
+
 #include "lite/backends/fpga/KD/pe.hpp"
 #include "lite/backends/fpga/KD/pe_params.hpp"
+#include "lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp"
+#include "lite/backends/fpga/KD/tensor.hpp"
 
 namespace paddle {
 namespace zynqmp {
+
 class ScalePE : public PE {
  public:
   inline int gcd(int a, int b) {
@@ -42,6 +47,8 @@ class ScalePE : public PE {
     Tensor* input = param_.input;
     Tensor* output = param_.output;
     Shape& input_shape = input->shape();
+    DepthwiseConvParam& dw_param = dw_pe_.param();
+
     int channel = input_shape.channel();
     int repeat = 1;
     int alignment = 16;
@@ -51,70 +58,142 @@ class ScalePE : public PE {
       int c_lcm = lcm(channel, alignment);
       repeat = c_lcm / (channel);
     }
+
+    // FPGA limits: H > 2047, W > 1023, or W * C > 65536 require falling
+    // back to the CPU implementation.
     Shape shape(N, {channel * repeat});
-    param_.alignedBias()->mutableData<float16>(FP16, shape);
-    param_.alignedScale()->mutableData<float16>(FP16, shape);
-    float16* bias_data = param_.alignedBias()->data<float16>();
-    float16* scale_data = param_.alignedScale()->data<float16>();
+    float* filter_data = filter.mutableData<float>(FP32, shape);
+    std::fill_n(filter_data, input->shape().channel(), 1.0f);
 
-    if (param_.bias != nullptr) {
-      float* bias_data_float = param_.bias->data<float>();
+    Tensor* scale = dw_param.scale();
+    float16* scale_data = scale->mutableData<float16>(FP16, shape);
+
+    Tensor* bias = dw_param.bias();
+    float16* bias_data = bias->mutableData<float16>(FP16, shape);
+    std::fill_n(bias_data, input->shape().channel(), 0);
+
+    if (param_.scale->dataType() == FP32) {
+      if (param_.bias != nullptr) {
+        float* bias_data_float = param_.bias->data<float>();
+        for (int i = 0; i < repeat; i++) {
+          for (int j = 0; j < length; j++) {
+            float16 value = float_to_half(bias_data_float[j]);
+            bias_data[i * length + j] = value;
+          }
+        }
+      } else {
+        float16 zero = float_to_half(0.0f);
+        for (int i = 0; i < repeat; i++) {
+          for (int j = 0; j < length; j++) {
+            bias_data[i * length + j] = zero;
+          }
+        }
+      }
+
+      float* scale_data_float = param_.scale->data<float>();
       for (int i = 0; i < repeat; i++) {
         for (int j = 0; j < length; j++) {
-          float16 value = float_to_half(bias_data_float[j]);
-          bias_data[i * length + j] = value;
+          float16 value = float_to_half(scale_data_float[j]);
+          scale_data[i * length + j]
= value; } } } else { - float16 zero = float_to_half(0.0f); + if (param_.bias != nullptr) { + float16* bias_data_float = param_.bias->data(); + for (int i = 0; i < repeat; i++) { + for (int j = 0; j < length; j++) { + float16 value = bias_data_float[j]; + bias_data[i * length + j] = value; + } + } + } else { + float16 zero = float_to_half(0.0f); + for (int i = 0; i < repeat; i++) { + for (int j = 0; j < length; j++) { + bias_data[i * length + j] = zero; + } + } + } + + float16* scale_data_float = param_.scale->data(); for (int i = 0; i < repeat; i++) { for (int j = 0; j < length; j++) { - bias_data[i * length + j] = zero; + float16 value = scale_data_float[j]; + scale_data[i * length + j] = value; } } } - float* scale_data_float = param_.scale->data(); - for (int i = 0; i < repeat; i++) { - for (int j = 0; j < length; j++) { - float16 value = float_to_half(scale_data_float[j]); - scale_data[i * length + j] = value; + dw_param.input = param_.input; + dw_param.output = param_.output; + dw_param.filter = &filter; + + dw_param.strides = {1, 1}; + dw_param.paddings = {0, 0}; + dw_param.kernelSize = {1, 1}; + dw_param.dilations = {1, 1}; + + dw_pe_.init(); + dw_pe_.apply(); + } + + void cpu_compute() { + Tensor* input = param_.input; + Tensor* output = param_.output; + Tensor float_input; + float* image_addr = float_input.mutableData(FP32, input->shape()); + input->syncToCPU(); + float_input.copyFrom(input); + float16* data_out = output->data(); + + float* scale_data = param_.scale->data(); + + int wh = input->shape().width() * input->shape().height(); + + float16* in_data = input->data(); + + float max = 0; + + for (int i = 0; i < wh; i++) { + for (int c = 0; c < input->shape().channel(); c++) { + int index = i * input->shape().channel() + c; + float value = half_to_float(in_data[index]) * scale_data[c]; + data_out[index] = float_to_half(value); + + if (value < 0) { + value = -value; + } + if (value > max) { + max = value; + } } } - - param_.alignedScale()->flush(); - param_.alignedBias()->flush(); - - int wc = input_shape.width() * input_shape.channel(); - int wc_aligned = align_image(wc); - - ScaleArgs& args = param_.args; - args.scale_address = param_.alignedScale()->data(); - args.bias_address = param_.alignedBias()->data(); - args.wc_alignment = wc_aligned; - args.channel_alignment = channel * repeat; - - args.image.address = input->data(); - args.image.scale_address = input->scale(); - args.image.channels = channel; - args.image.height = input_shape.height(); - args.image.width = input_shape.width(); - args.image.pad_width = 0; - args.image.pad_height = 0; - args.output.address = output->data(); - args.output.scale_address = output->scale(); + output->flush(); + output->scale()[0] = max / 127.0f; + output->scale()[1] = 127.0f / max; } bool dispatch() { + if (param_.scale->dataType() == FP16) { + DepthwiseConvParam& dw_param = dw_pe_.param(); + memcpy(dw_param.quantizedFilter()->mutableData(), + param_.scale->data(), + param_.scale->shape().numel() * sizeof(float16)); + dw_param.quantizedFilter()->scale()[0] = param_.scale->scale()[0]; + dw_param.quantizedFilter()->scale()[1] = param_.scale->scale()[1]; + + dw_param.quantizedFilter()->flush(); + } param_.input->syncToDevice(); - return compute_fpga_scale(param_.args) == 0; + return dw_pe_.dispatch(); } ScaleParam& param() { return param_; } private: ScaleParam param_; + Tensor filter; + DepthwiseConvPE dw_pe_; }; } // namespace zynqmp } // namespace paddle diff --git a/lite/backends/fpga/KD/shape.hpp b/lite/backends/fpga/KD/shape.hpp index 
566ad8e6ff2eff32301e83b6cdb5b1addd0117fe..c25c3315145137a147928a164fcabd2923b09e87 100755 --- a/lite/backends/fpga/KD/shape.hpp +++ b/lite/backends/fpga/KD/shape.hpp @@ -23,6 +23,7 @@ limitations under the License. */ namespace paddle { namespace zynqmp { +static struct None none_; static struct NCHW nchw_; static struct NHWC nhwc_; static struct NC nc_; @@ -82,6 +83,9 @@ class Shape { void setLayoutType(LayoutType layout) { this->layoutType_ = layout; switch (layout) { + case None: + layout_ = &none_; + break; case NCHW: layout_ = &nchw_; break; diff --git a/lite/backends/fpga/KD/tensor.hpp b/lite/backends/fpga/KD/tensor.hpp index f003ded33eb51136ae0ae0a2c21988460232f89a..f1b07d02622fad32e99205667424a4cb3c9fb46d 100644 --- a/lite/backends/fpga/KD/tensor.hpp +++ b/lite/backends/fpga/KD/tensor.hpp @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include #include #include @@ -24,13 +25,10 @@ limitations under the License. */ #include #include -// #include "lite/core/tensor.h" - #include "lite/backends/fpga/KD/dl_engine.hpp" #include "lite/backends/fpga/KD/float16.hpp" #include "lite/backends/fpga/KD/llapi/zynqmp_api.h" #include "lite/backends/fpga/KD/shape.hpp" -// #include "lite/backends/fpga/KD/types.hpp" namespace paddle { namespace zynqmp { @@ -117,7 +115,8 @@ class Tensor { template Dtype* mutableData() { - size_t memorySize = shape_->memorySize(CellSize(dataType_)); + size_t memorySize = + shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_; if (placeHolder_ != nullptr) { if (memorySize > placeHolder_->memorySize()) { placeHolder_.reset(new PlaceHolder(memorySize)); @@ -241,6 +240,10 @@ class Tensor { } } + void setMemScale(float scale_factor) { + this->mem_scale_factor_ = scale_factor; + } + void shareDataWith(Tensor* src) { shareDataWith(src, src->shape()); } void shareDataWith(Tensor* src, const Shape& shape, int offset = 0) { @@ -276,9 +279,11 @@ class Tensor { .height = 1, .pad_width = 0u, .pad_height = 0u}; - args.output = { + + ImageOutputArgs output = { .address = data(), .scale_address = scale(), }; + args.output = output; src->syncToDevice(); size_t aligned_remainder = src->shape().numel() % 16; if (aligned_remainder > 0) { @@ -294,10 +299,16 @@ class Tensor { this->invalidate(); } - void flush() { fpga_flush(placeHolder_->data(), placeHolder_->memorySize()); } + void flush() { + size_t memorySize = + shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_; + fpga_flush(placeHolder_->data(), memorySize); + } void invalidate() { - fpga_invalidate(placeHolder_->data(), placeHolder_->memorySize()); + size_t memorySize = + shape_->memorySize(CellSize(dataType_)) * mem_scale_factor_; + fpga_invalidate(placeHolder_->data(), memorySize); } void sync() { @@ -339,6 +350,8 @@ class Tensor { } } + void printScale(std::string type) { printScale(); } + std::string dimsFileName() { return std::to_string(shape_->num()) + "_" + std::to_string(shape_->channel()) + "_" + @@ -358,29 +371,9 @@ class Tensor { saveToFile(path); } - friend std::ostream& operator<<(std::ostream& os, Tensor& tensor) { - os << "tensor:" - << "\n"; - os << "dims: {"; - for (int i = 0; i < tensor.shape().dimSize(); ++i) { - os << tensor.shape()[i] << " "; - } - os << "}\n"; - for (int i = 0; i < tensor.shape().numel(); i++) { - float value = 0; - if (tensor.dataType() == FP32) { - value = tensor.data()[i]; - } else { - value = half_to_float(tensor.data()[i]); - } - os << value << " "; - } - os << "\n"; - return os; - } - void saveToFile(std::string path) { syncToCPU(); + 
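// saveToFile reads the tensor through its CPU pointer, so the data is
+    // synced and then invalidated first; this assumes fpga_invalidate()
+    // (called by invalidate() above) discards stale CPU cachelines for the
+    // tensor's placeholder.
+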
invalidate(); std::ofstream ofs; static int counter = 0; std::string npath = std::to_string(counter) + "_" + path; @@ -389,17 +382,18 @@ class Tensor { } void save_file_with_name(std::string path) { - // return; invalidate(); std::ofstream ofs; - ofs.open(path); + for (int i = 0; i < shape_->numel(); i++) { float value = 0; if (dataType_ == FP32) { value = data()[i]; - } else { + } else if (dataType_ == FP16) { value = half_to_float(data()[i]); + } else { + value = data()[i]; } ofs << value << std::endl; } @@ -415,18 +409,49 @@ class Tensor { int num = shape_->numel(); invalidate(); float max = 0.0f; - float16* data = mutableData(); - for (int i = 0; i < num; ++i) { - float value = 0; - file_stream >> value; - max = std::max(std::abs(value), max); - data[i] = float_to_half(value); + if (dataType_ == FP16) { + float16* data = mutableData(); + for (int i = 0; i < num; ++i) { + float value = 0; + file_stream >> value; + max = std::max(std::abs(value), max); + data[i] = float_to_half(value); + } + } else { + float* data = mutableData(); + for (int i = 0; i < num; ++i) { + float value = 0; + file_stream >> value; + max = std::max(std::abs(value), max); + data[i] = value; + } } flush(); placeHolder_->scale_[0] = max / 127.0f; placeHolder_->scale_[1] = 127.0f / max; } + friend std::ostream& operator<<(std::ostream& os, Tensor& tensor) { + os << "tensor:" + << "\n"; + os << "dims: {"; + for (int i = 0; i < tensor.shape().dimSize(); ++i) { + os << tensor.shape()[i] << " "; + } + os << "}\n"; + for (int i = 0; i < tensor.shape().numel(); i++) { + float value = 0; + if (tensor.dataType() == FP32) { + value = tensor.data()[i]; + } else { + value = half_to_float(tensor.data()[i]); + } + os << value << " "; + } + os << "\n"; + return os; + } + ~Tensor() { if (shape_ != nullptr) { delete shape_; @@ -436,6 +461,7 @@ class Tensor { private: int offset = 0; + float mem_scale_factor_ = 1.0f; std::shared_ptr placeHolder_; Shape* shape_ = nullptr; DataType dataType_ = FP32; diff --git a/lite/backends/fpga/lite_tensor.cc b/lite/backends/fpga/lite_tensor.cc old mode 100644 new mode 100755 index 43218173fd05626fb46495bb254b250c14e5417a..7f1e8d3e17f97315e77532b77bbcfcc8331edd4f --- a/lite/backends/fpga/lite_tensor.cc +++ b/lite/backends/fpga/lite_tensor.cc @@ -95,16 +95,14 @@ void TensorLite::CopyDataFrom(const TensorLite &other) { dims_ = other.dims_; target_ = other.target_; lod_ = other.lod_; - // memory_size_ = other.memory_size_; - // buffer_->CopyDataFrom(*other.buffer_, memory_size_); - zynq_tensor_->mutableData(other.zynq_tensor_->dataType(), - other.zynq_tensor_->shape()); -} + auto dt = zynq_tensor_->dataType(); -// template -// void TensorLite::mutable_data_internal() { + auto shape = other.zynq_tensor_->shape(); -// } + Resize(other.dims()); + zynq_tensor_->mutableData(zynq_tensor_->dataType(), shape); + this->ZynqTensor()->copyFrom(other.ZynqTensor()); +} } // namespace lite } // namespace paddle diff --git a/lite/backends/fpga/lite_tensor.h b/lite/backends/fpga/lite_tensor.h index 2f9df3abb08dd15641323f4a3c59d6175f2e481b..311fc8a98400e5a6916ba1b9c8de1e6e0bcec4c0 100644 --- a/lite/backends/fpga/lite_tensor.h +++ b/lite/backends/fpga/lite_tensor.h @@ -106,7 +106,7 @@ class TensorLite { // For other devices, T and R may be the same type. 
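  // NOTE: data() below is offset-aware: the raw pointer obtained from the
  // underlying zynqmp::Tensor is advanced by offset_ (counted in elements
  // of R) before being returned, so sliced views read from the right
  // position in the parent's storage.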
template const R *data() const { - return zynq_tensor_->data(); + return zynq_tensor_->data() + offset_; } void Resize(const DDimLite &ddim) { dims_ = ddim; } @@ -125,6 +125,7 @@ class TensorLite { bool persistable() const { return persistable_; } void set_persistable(bool persistable) { persistable_ = persistable; } + // T is the data type and R is the return type // For OpenCL, the return type can be cl::Buffer // and the data type can be float/int8_t. @@ -147,6 +148,8 @@ class TensorLite { size_t memory_size() const { return zynq_tensor_->memorySize(); } + size_t offset() const { return offset_; } + bool IsInitialized() const { return buffer_->data(); } // Other share data to this. @@ -157,6 +160,9 @@ class TensorLite { template TensorLite Slice(int64_t begin, int64_t end) const; + template + void Slice(TensorLite &dst, int64_t begin, int64_t end) const; // NOLINT + TargetType target() const { return target_; } zynqmp::Tensor *ZynqTensor() const { return zynq_tensor_; } @@ -173,16 +179,21 @@ class TensorLite { private: TargetType target_{TargetType::kHost}; + + // precision_ and persistable_ are only used for persistable vars. + // If your tensor wants to be saved and loaded correctly, you must + // set values of precision_ and persistable_ after updating it. + // If your tensor is just a temp tensor, such as activations, + // you can ignore these two attributes. + PrecisionType precision_{PrecisionType::kUnk}; + bool persistable_{false}; + DDimLite dims_; std::shared_ptr buffer_; LoD lod_; size_t memory_size_{}; - size_t offset_{0}; - PrecisionType precision_{PrecisionType::kUnk}; - bool persistable_{false}; - zynqmp::Tensor *zynq_tensor_ = new zynqmp::Tensor(); template @@ -197,6 +208,9 @@ R *TensorLite::mutable_data() { } zynqmp::LayoutType layout_type = zynqmp::NCHW; switch (v.size()) { + case 0: + layout_type = zynqmp::None; + break; case 1: layout_type = zynqmp::N; break; @@ -228,24 +242,60 @@ R *TensorLite::mutable_data(TargetType target) { return mutable_data(); } -template -bool TensorCompareWith(const TensorT &a, const TensorT &b) { - if (a.dims() != b.dims()) return false; - if (memcmp(a.raw_data(), b.raw_data(), a.data_size()) != 0) return false; - return true; -} template TensorLite TensorLite::Slice(int64_t begin, int64_t end) const { - int64_t base = numel() / dims_[0]; + throw - 1; + CHECK_GE(begin, 0); + CHECK_LE(end, dims_[0]); + CHECK_LT(begin, end); + if (dims_[0] == 1) { + return *this; + } else { + int64_t base = numel() / dims_[0]; + + TensorLite dst; + dst.target_ = target_; + auto dst_dims = dims_; + dst_dims[0] = end - begin; + dst.Resize(dst_dims); + void *dst_data = dst.mutable_data(); + + T *src_data = const_cast(data()); + memcpy(dst_data, + src_data + static_cast(begin * base) * sizeof(T), + dst_dims.production() * sizeof(T)); + dst.ZynqTensor()->saveToFile("_slice", true); + + return dst; + } +} + +template +void TensorLite::Slice(TensorLite &dst, int64_t begin, int64_t end) const { + CHECK_GE(begin, 0); + CHECK_LE(end, dims_[0]); + CHECK_LT(begin, end); - TensorLite dst; - dst.buffer_ = buffer_; dst.target_ = target_; auto dst_dims = dims_; dst_dims[0] = end - begin; dst.Resize(dst_dims); - dst.offset_ = offset_ + static_cast(begin * base) * sizeof(T); - return dst; + void *dst_data = dst.mutable_data(); + + int64_t base = numel() / dims_[0]; + + T *src_data = const_cast(data()); + memcpy(dst_data, + src_data + static_cast(begin * dst_dims.production()), + dst_dims.production() * sizeof(T)); } + +template +bool TensorCompareWith(const TensorT &a, const 
TensorT &b) { + if (a.dims() != b.dims()) return false; + if (memcmp(a.raw_data(), b.raw_data(), a.data_size()) != 0) return false; + return true; +} + } // namespace lite } // namespace paddle diff --git a/lite/backends/npu/CMakeLists.txt b/lite/backends/npu/CMakeLists.txt index 426ff5698146c773c818b2bfd598d6bbbdf7867f..1540741d331097961dcf7cd791c9785a9c53ddd1 100644 --- a/lite/backends/npu/CMakeLists.txt +++ b/lite/backends/npu/CMakeLists.txt @@ -2,5 +2,4 @@ if(NOT LITE_WITH_NPU) return() endif() -lite_cc_library(npu_runtime SRCS runtime.cc DEPS ${npu_runtime_libs}) -lite_cc_library(npu_builder SRCS builder.cc DEPS ${npu_builder_libs} npu_runtime tensor op scope) +lite_cc_library(device_npu SRCS device.cc DEPS ${npu_builder_libs} ${npu_runtime_libs}) diff --git a/lite/backends/npu/device.cc b/lite/backends/npu/device.cc new file mode 100644 index 0000000000000000000000000000000000000000..e63939264214bc619814f06c7cf0de1b56f71ee6 --- /dev/null +++ b/lite/backends/npu/device.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/npu/device.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace npu { + +std::unique_ptr Device::Build( + std::string& model_name, // NOLINT + std::vector& input_nodes, // NOLINT + std::vector& output_nodes // NOLINT + ) { + VLOG(3) << "[NPU] Build model"; + // Build the HiAI IR graph to the HiAI om model + ge::Graph ir_graph("graph"); + ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes); + ge::Model om_model("model", "model"); + om_model.SetGraph(ir_graph); + domi::HiaiIrBuild ir_build; + domi::ModelBufferData om_model_buf; + if (!ir_build.CreateModelBuff(om_model, om_model_buf)) { + LOG(WARNING) << "[NPU] CreateModelBuff failed!"; + return nullptr; + } + if (!ir_build.BuildIRModel(om_model, om_model_buf)) { + LOG(WARNING) << "[NPU] BuildIRModel failed!"; + ir_build.ReleaseModelBuff(om_model_buf); + return nullptr; + } + // Create a HiAI model manager client to load the HiAI om model + std::unique_ptr model_client( + new hiai::AiModelMngerClient()); + if (model_client->Init(nullptr) != hiai::AI_SUCCESS) { + LOG(WARNING) << "[NPU] AiModelMngerClient init failed)!"; + ir_build.ReleaseModelBuff(om_model_buf); + return nullptr; + } + model_name = "model_" + std::to_string(model_count_++) + ".om"; + auto model_desc = std::make_shared( + model_name, freq_level(), framework_type(), model_type(), device_type()); + model_desc->SetModelBuffer(om_model_buf.data, om_model_buf.length); + std::vector> model_descs; + model_descs.push_back(model_desc); + if (model_client->Load(model_descs) != hiai::AI_SUCCESS) { + LOG(WARNING) << "[NPU] AiModelMngerClient load model failed!"; + ir_build.ReleaseModelBuff(om_model_buf); + return nullptr; + } + ir_build.ReleaseModelBuff(om_model_buf); + return model_client; +} + +} // namespace npu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/npu/runtime.h 
b/lite/backends/npu/device.h similarity index 66% rename from lite/backends/npu/runtime.h rename to lite/backends/npu/device.h index 8b1ad51518d8626d9a6ecd6203a70b2637bb6004..3eba0b77e4bdeb26cdff869771645a5ce7637ae4 100644 --- a/lite/backends/npu/runtime.h +++ b/lite/backends/npu/device.h @@ -13,38 +13,47 @@ // limitations under the License. #pragma once + #include #include +#include +#include #include "ai_ddk_lib/include/HiAiModelManagerService.h" -#include "lite/core/tensor.h" +#include "ai_ddk_lib/include/hiai_ir_build.h" namespace paddle { namespace lite { namespace npu { -class DeviceInfo { +class Device { public: - static DeviceInfo &Global() { - static DeviceInfo x; + static Device& Global() { + static Device x; return x; } - DeviceInfo() {} + Device() {} int freq_level() { return freq_level_; } int framework_type() { return framework_type_; } int model_type() { return model_type_; } int device_type() { return device_type_; } + // Build the HiAI IR graph to om model, return HiAI model manager client to + // load om model and run inference. + std::unique_ptr Build( + std::string& model_name, // NOLINT + std::vector& input_nodes, // NOLINT + std::vector& output_nodes // NOLINT + ); // NOLINT + private: int freq_level_{3}; int framework_type_{0}; int model_type_{0}; int device_type_{0}; + int model_count_{0}; }; -bool LoadModel(const lite::Tensor &model_data, - std::shared_ptr *model_client, - std::string *model_name); } // namespace npu } // namespace lite } // namespace paddle diff --git a/lite/backends/npu/runtime.cc b/lite/backends/npu/runtime.cc deleted file mode 100644 index 3485f63c7c8bb91081fd1969d0d41733417149d9..0000000000000000000000000000000000000000 --- a/lite/backends/npu/runtime.cc +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/backends/npu/runtime.h" -#include -#include -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace npu { - -// Create hiai model manager to load om model from lite tensor, and return the -// manager and an unique model name -bool LoadModel(const lite::Tensor &model_data, - std::shared_ptr *model_client, - std::string *model_name) { - LOG(INFO) << "[NPU] Load model."; - auto model_data_ptr = model_data.data(); - auto model_data_size = model_data.numel() * sizeof(int8_t); - if (model_data_ptr == nullptr || model_data_size == 0) { - return false; - } - *model_client = std::make_shared(); - int ret = (*model_client)->Init(nullptr); - if (ret != hiai::AI_SUCCESS) { - LOG(WARNING) << "[NPU] AiModelMngerClient init failed(" << ret << ")!"; - return false; - } - *model_name = "model.om"; - auto model_desc = std::make_shared( - *model_name, - DeviceInfo::Global().freq_level(), - DeviceInfo::Global().framework_type(), - DeviceInfo::Global().model_type(), - DeviceInfo::Global().device_type()); - model_desc->SetModelBuffer(model_data_ptr, model_data_size); - std::vector> model_descs; - model_descs.push_back(model_desc); - if ((*model_client)->Load(model_descs) != hiai::AI_SUCCESS) { - LOG(WARNING) << "[NPU] AiModelMngerClient load model failed!"; - return false; - } - return true; -} - -} // namespace npu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/CMakeLists.txt b/lite/backends/opencl/CMakeLists.txt index 1acb98321844191832fd55b640a9b56d3d51b400..dd7f6b417e0d6416eec9bb3e60ef088432776112 100644 --- a/lite/backends/opencl/CMakeLists.txt +++ b/lite/backends/opencl/CMakeLists.txt @@ -11,8 +11,8 @@ lite_cc_library(cl_image SRCS cl_image.cc DEPS tensor cl_image_converter cl_runt lite_cc_library(cl_caller SRCS cl_caller.cc DEPS cl_context cl_image) lite_cc_library(cl_target_wrapper SRCS target_wrapper.cc DEPS cl_runtime) lite_cc_test(test_cl_functions SRCS cl_functions_test.cc DEPS cl_context cl_image cl_caller cl_wrapper cl_target_wrapper - ARGS --cl_path=${CMAKE_SOURCE_DIR}/paddle/fluid/lite/backends/opencl) + ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) lite_cc_test(test_cl_im2col SRCS cl_im2col_test.cc DEPS tensor cl_context cl_wrapper cl_target_wrapper - ARGS --cl_path=${CMAKE_SOURCE_DIR}/paddle/fluid/lite/backends/opencl) + ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) add_dependencies(cl_wrapper opencl_clhpp) diff --git a/lite/backends/opencl/cl_caller.cc b/lite/backends/opencl/cl_caller.cc index 4926a53c43d54b4e2b4d802a7d8ef289c7e87fc5..6b9cab1056beaa6f516a0d3a202a7816c911f1b2 100644 --- a/lite/backends/opencl/cl_caller.cc +++ b/lite/backends/opencl/cl_caller.cc @@ -23,6 +23,7 @@ limitations under the License. */ namespace paddle { namespace lite { + static void CopyImageData(CLContext* context, const CLImage& cl_image, float* out) { @@ -51,119 +52,5 @@ bool InitOpenCLRuntime(std::string cl_path) { return runtime->IsInitSuccess(); } -void elementwise_add(CLContext* context, - const float* in, - const DDim& in_dim, - const float* bias, - const DDim& bias_dim, - float* out, - const DDim& out_dim) { - if (!(bias_dim.size() == 1 || bias_dim.size() == 4)) { - LOG(FATAL) << "Error: bias dims is error"; - return; - } - auto kernel = bias_dim.size() == 1 ? 
context->GetKernel("channel_add") - : context->GetKernel("elementwise_add"); - CLImage in_image; - in_image.set_tensor_data(in, in_dim); - in_image.InitNormalCLImage(context->GetContext()); - VLOG(3) << " --- Inpu image: " << in_image << " --- "; - CLImage bias_image; - bias_image.set_tensor_data(bias, bias_dim); - bias_image.InitCLImage(context->GetContext()); - VLOG(3) << " --- Bias image: " << bias_image << " --- "; - CLImage out_image; - out_image.InitEmptyImage(context->GetContext(), out_dim); - cl_int status; - status = kernel.setArg(0, *in_image.cl_image()); - CL_CHECK_FATAL(status); - status = kernel.setArg(1, *bias_image.cl_image()); - CL_CHECK_FATAL(status); - status = kernel.setArg(2, *out_image.cl_image()); - CL_CHECK_FATAL(status); - - if (bias_dim.size() == 1) { - int tensor_w = in_dim[3]; - status = kernel.setArg(3, tensor_w); - CL_CHECK_FATAL(status); - } - size_t width = in_image.ImageWidth(); - size_t height = in_image.ImageHeight(); - auto global_work_size = cl::NDRange{width, height}; - status = context->GetCommandQueue().enqueueNDRangeKernel( - kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr); - CL_CHECK_FATAL(status); - - status = context->GetCommandQueue().finish(); - CL_CHECK_FATAL(status); - VLOG(3) << " --- Out image: " << out_image << " --- "; - CopyImageData(context, out_image, out); -} - -void pool(CLContext* context, - const std::string pooling_type, - const int pad_h, - const int pad_w, - const int stride_h, - const int stride_w, - const int ksize_h, - const int ksize_w, - const float* in, - const DDim& in_dim, - float* out, - const DDim& out_dim) { - auto kernel = - context->GetKernel(string_format("pool_%s", pooling_type.c_str())); - CLImage in_image; - in_image.set_tensor_data(in, in_dim); - in_image.InitNormalCLImage(context->GetContext()); - VLOG(3) << " --- Inpu image: " << in_image << " --- "; - CLImage out_image; - out_image.InitEmptyImage(context->GetContext(), out_dim); - auto global_work_size = context->DefaultWorkSize(out_image); - auto* in_converter = - dynamic_cast(in_image.image_converter()); - auto* out_converter = - dynamic_cast(out_image.image_converter()); - const int in_height = in_converter->HeightOfOneBlock(); - const int in_width = in_converter->WidthOfOneBlock(); - const int out_height = out_converter->HeightOfOneBlock(); - const int out_width = out_converter->WidthOfOneBlock(); - cl_int status; - status = kernel.setArg(0, in_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(1, in_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(2, out_height); - CL_CHECK_FATAL(status); - status = kernel.setArg(3, out_width); - CL_CHECK_FATAL(status); - status = kernel.setArg(4, pad_h); - CL_CHECK_FATAL(status); - status = kernel.setArg(5, pad_w); - CL_CHECK_FATAL(status); - status = kernel.setArg(6, stride_h); - CL_CHECK_FATAL(status); - status = kernel.setArg(7, stride_w); - CL_CHECK_FATAL(status); - status = kernel.setArg(8, ksize_h); - CL_CHECK_FATAL(status); - status = kernel.setArg(9, ksize_w); - CL_CHECK_FATAL(status); - status = kernel.setArg(10, *in_image.cl_image()); - CL_CHECK_FATAL(status); - status = kernel.setArg(11, *out_image.cl_image()); - CL_CHECK_FATAL(status); - - status = context->GetCommandQueue().enqueueNDRangeKernel( - kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr); - CL_CHECK_FATAL(status); - - status = context->GetCommandQueue().finish(); - CL_CHECK_FATAL(status); - VLOG(3) << " --- Out image: " << out_image << " --- "; - CopyImageData(context, out_image, 
out); -} - } // namespace lite } // namespace paddle diff --git a/lite/backends/opencl/cl_caller.h b/lite/backends/opencl/cl_caller.h index ed5c9153d3cedf140cbf0570b7f71393fb918bf9..1817db9f6bd6d9ecf21978b8293bd9534328de0f 100644 --- a/lite/backends/opencl/cl_caller.h +++ b/lite/backends/opencl/cl_caller.h @@ -23,30 +23,5 @@ namespace lite { bool InitOpenCLRuntime(std::string cl_path); -/// An elementwise_add method to embed OpenCL logic inside, it is used as a -/// black box so that the framework can remain simple. -/// NOTE Currently, these methods are quite expensive, we will optimize them -/// latter. -void elementwise_add(CLContext* context, - const float* in, - const DDim& in_dim, - const float* bias, - const DDim& bias_dim, - float* out, - const DDim& out_dim); - -void pool(CLContext* context, - const std::string pooling_type, - const int pad_h, - const int pad_w, - const int stride_h, - const int stride_w, - const int ksize_h, - const int ksize_w, - const float* in, - const DDim& in_dim, - float* out, - const DDim& out_dim); - } // namespace lite } // namespace paddle diff --git a/lite/backends/opencl/cl_functions_test.cc b/lite/backends/opencl/cl_functions_test.cc index b9f6648c9956e1952b65f66abfa40d912a99ee67..70f47b47946641edf4d023437b48d46cae93ca6e 100644 --- a/lite/backends/opencl/cl_functions_test.cc +++ b/lite/backends/opencl/cl_functions_test.cc @@ -41,9 +41,10 @@ TEST(cl_test, runtime_test) { auto &context = runtime->context(); auto program = runtime->CreateProgram( context, - runtime->cl_path() + "/cl_kernel/" + "image/elementwise_add_kernel.cl"); + runtime->cl_path() + "/cl_kernel/" + "buffer/elementwise_add_kernel.cl"); auto event = runtime->CreateEvent(context); - CHECK(runtime->BuildProgram(program.get())); + const std::string build_option("-DCL_DTYPE_float"); + CHECK(runtime->BuildProgram(program.get(), build_option)); } TEST(cl_test, context_test) { @@ -51,9 +52,11 @@ TEST(cl_test, context_test) { CHECK(runtime->IsInitSuccess()); runtime->set_cl_path(FLAGS_cl_path); CLContext context; - context.AddKernel("pool_max", "image/pool_kernel.cl", ""); - context.AddKernel("elementwise_add", "image/elementwise_add_kernel.cl", ""); - context.AddKernel("elementwise_add", "image/elementwise_add_kernel.cl", ""); + context.AddKernel("pool_max", "image/pool_kernel.cl", "-DCL_DTYPE_float"); + context.AddKernel( + "elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float"); + context.AddKernel( + "elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float"); } TEST(cl_test, kernel_test) { @@ -61,9 +64,11 @@ TEST(cl_test, kernel_test) { CHECK(runtime->IsInitSuccess()); runtime->set_cl_path(FLAGS_cl_path); std::unique_ptr context(new CLContext); - context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl"); - context->AddKernel("pool_max", "image/pool_kernel.cl"); - context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl"); + context->AddKernel( + "elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float"); + context->AddKernel("pool_max", "image/pool_kernel.cl", "-DCL_DTYPE_float"); + context->AddKernel( + "elementwise_add", "image/elementwise_add_kernel.cl", "-DCL_DTYPE_float"); auto kernel = context->GetKernel(2); std::unique_ptr in_data(new float[4 * 3 * 256 * 512]); @@ -115,203 +120,12 @@ TEST(cl_test, kernel_test) { LOG(INFO) << out_image; } -TEST(cl_test, channel_add_test) { - std::default_random_engine engine; - std::uniform_real_distribution dist(-5, 5); - - const DDim in_dim = DDim(std::vector{4, 16, 256, 
512}); - std::unique_ptr in_data(new float[4 * 16 * 256 * 512]); - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - in_data[i] = dist(engine); - } - - const DDim bias_dim = DDim(std::vector{16}); - std::unique_ptr bias_data(new float[16]); - for (int i = 0; i < 16; i++) { - bias_data[i] = dist(engine); - } - - std::unique_ptr out_ref(new float[4 * 16 * 256 * 512]); - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 16; j++) { - float b = bias_data[j]; - for (int k = 0; k < 256 * 512; k++) { - int index = (i * 16 + j) * 256 * 512 + k; - out_ref[index] = in_data[index] + b; - } - } - } - - const DDim out_dim = DDim(std::vector{4, 16, 256, 512}); - std::unique_ptr out(new float[4 * 16 * 256 * 512]); - - bool status = InitOpenCLRuntime(FLAGS_cl_path); - CHECK(status) << "Fail to initialize OpenCL runtime."; - std::unique_ptr context(new CLContext); - context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl"); - context->AddKernel("channel_add", "image/channel_add_kernel.cl"); - elementwise_add(context.get(), - in_data.get(), - in_dim, - bias_data.get(), - bias_dim, - out.get(), - out_dim); - - int stride = 4 * 16 * 256 * 512 / 20; - for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) { - std::cout << out[i] << " "; - } - std::cout << std::endl; - - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - EXPECT_NEAR(out[i], out_ref[i], 1e-6); - } -} - -TEST(cl_test, elementwise_add_test) { - std::default_random_engine engine; - std::uniform_real_distribution dist(-5, 5); - - const DDim in_dim = DDim(std::vector{4, 16, 256, 512}); - std::unique_ptr in_data(new float[4 * 16 * 256 * 512]); - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - in_data[i] = dist(engine); - } - - const DDim bias_dim = DDim(std::vector{4, 16, 256, 512}); - std::unique_ptr bias_data(new float[4 * 16 * 256 * 512]); - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - bias_data[i] = dist(engine); - } - - std::unique_ptr out_ref(new float[4 * 16 * 256 * 512]); - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - out_ref[i] = in_data[i] + bias_data[i]; - } - - const DDim out_dim = DDim(std::vector{4, 16, 256, 512}); - std::unique_ptr out(new float[4 * 16 * 256 * 512]); - - bool status = InitOpenCLRuntime(FLAGS_cl_path); - CHECK(status) << "Fail to initialize OpenCL runtime."; - std::unique_ptr context(new CLContext); - context->AddKernel("elementwise_add", "image/elementwise_add_kernel.cl"); - context->AddKernel("channel_add", "image/channel_add_kernel.cl"); - elementwise_add(context.get(), - in_data.get(), - in_dim, - bias_data.get(), - bias_dim, - out.get(), - out_dim); - - int stride = 4 * 16 * 256 * 512 / 20; - for (int i = 0; i < 4 * 16 * 256 * 512; i += stride) { - std::cout << out[i] << " "; - } - std::cout << std::endl; - - for (int i = 0; i < 4 * 16 * 256 * 512; i++) { - EXPECT_NEAR(out[i], out_ref[i], 1e-6); - } -} - -void pool_avg(const int padding_height, - const int padding_width, - const int stride_height, - const int stride_width, - const int ksize_height, - const int ksize_width, - const float *input_data, - const DDim &in_dim, - float *output_data, - const DDim &out_dim) { - const int batch_size = in_dim[0]; - const int input_height = in_dim[2]; - const int input_width = in_dim[3]; - const int output_channels = out_dim[1]; - const int output_height = out_dim[2]; - const int output_width = out_dim[3]; - - const size_t input_spatial_size = input_height * input_width; - const size_t output_spatial_size = output_height * output_width; - - for (int i = 0; i < batch_size; i++) { - for (int c = 0; c < 
output_channels; ++c) { - int channel = i * output_channels + c; - const float *input_ptr = input_data + channel * input_spatial_size; - float *output_ptr = output_data + channel * output_spatial_size; - - for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - - float val = 0.f; - int count = 0; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - val += input_ptr[h * input_width + w]; - ++count; - } - } - output_ptr[ph * output_width + pw] = - (count > 0) ? val * (1.f / count) : 0.f; - } - } - } - } -} - -TEST(cl_test, pool_test) { - std::default_random_engine engine; - std::uniform_real_distribution dist(-5, 5); - - const DDim in_dim = DDim(std::vector{4, 1024, 7, 7}); - std::unique_ptr in_data(new float[4 * 1024 * 7 * 7]); - for (int i = 0; i < 4 * 1024 * 7 * 7; i++) { - in_data[i] = dist(engine); - } - - const DDim out_dim = DDim(std::vector{4, 1024, 1, 1}); - std::unique_ptr out(new float[4 * 1024 * 1 * 1]); - std::unique_ptr out_ref(new float[4 * 1024 * 1 * 1]); - - bool status = InitOpenCLRuntime(FLAGS_cl_path); - CHECK(status) << "Fail to initialize OpenCL runtime."; - std::unique_ptr context(new CLContext); - context->AddKernel("pool_max", "image/pool_kernel.cl"); - context->AddKernel("pool_avg", "image/pool_kernel.cl"); - pool(context.get(), - "avg", - 0, - 0, - 1, - 1, - 7, - 7, - in_data.get(), - in_dim, - out.get(), - out_dim); - pool_avg(0, 0, 1, 1, 7, 7, in_data.get(), in_dim, out_ref.get(), out_dim); - - for (int i = 0; i < 4 * 1024 * 1 * 1; i++) { - EXPECT_NEAR(out[i], out_ref[i], 1e-6); - } -} - TEST(cl_test, target_wrapper_buffer_test) { bool inited = InitOpenCLRuntime(FLAGS_cl_path); CHECK(inited) << "Fail to initialize OpenCL runtime."; std::unique_ptr context(new CLContext); std::string kernel_name = "elementwise_add"; - std::string build_options = "-DCL_DTYPE=float"; + std::string build_options = "-DCL_DTYPE_float"; context->AddKernel( kernel_name, "buffer/elementwise_add_kernel.cl", build_options); std::vector h_a; @@ -396,10 +210,13 @@ TEST(cl_test, target_wrapper_buffer_test) { TEST(cl_test, target_wrapper_image_test) { const size_t cl_image2d_width = 28; const size_t cl_image2d_height = 32; + const size_t cl_image2d_elem_size = + cl_image2d_width * cl_image2d_height * 4; // 4 for RGBA channels const size_t cl_image2d_row_pitch{0}; const size_t cl_image2d_slice_pitch{0}; auto *d_image = static_cast( TargetWrapperCL::MallocImage(cl_image2d_width, cl_image2d_height)); + // Map/Unmap test auto *h_image = static_cast(TargetWrapperCL::MapImage(d_image, @@ -407,15 +224,11 @@ TEST(cl_test, target_wrapper_image_test) { cl_image2d_height, cl_image2d_row_pitch, cl_image2d_slice_pitch)); - CHECK_EQ( - cl_image2d_row_pitch, - cl_image2d_width * 4 * - 4); // row_pitch = 448 = 28 * 4 (RGBA: 4 floats) * 4 (float in bytes) - CHECK_EQ(cl_image2d_slice_pitch, 0); // slice_pitch = 0 + CHECK_EQ(cl_image2d_slice_pitch, 0); LOG(INFO) << "cl_image2d_row_pitch = " << cl_image2d_row_pitch << ", cl_image2d_slice_pitch " << cl_image2d_slice_pitch; - for (int i = 0; i < 10; i++) { + for (int i = 0; i < cl_image2d_elem_size; i++) { h_image[i] = 3.14f * i; } TargetWrapperCL::Unmap(d_image, h_image); @@ -426,15 +239,14 @@ TEST(cl_test, 
target_wrapper_image_test) { cl_image2d_height, cl_image2d_row_pitch, cl_image2d_slice_pitch)); - for (int i = 0; i < 10; i++) { + for (int i = 0; i < cl_image2d_elem_size; i++) { EXPECT_NEAR(h_ptr[i], 3.14f * i, 1e-6); } TargetWrapperCL::Unmap(d_image, h_ptr); // Imagecpy test - std::vector h_image_cpy(cl_image2d_width * 4 * - cl_image2d_height); // 4 for RGBA channels - for (int i = 0; i < cl_image2d_width * 4 * cl_image2d_height; i++) { + std::vector h_image_cpy(cl_image2d_elem_size); + for (int i = 0; i < cl_image2d_elem_size; i++) { h_image_cpy[i] = 3.14f; } TargetWrapperCL::ImgcpySync(d_image, @@ -446,6 +258,8 @@ TEST(cl_test, target_wrapper_image_test) { IoDirection::HtoD); auto *d_image_cpy = static_cast( TargetWrapperCL::MallocImage(cl_image2d_width, cl_image2d_height)); + + // device to device TargetWrapperCL::ImgcpySync(d_image_cpy, d_image, cl_image2d_width, @@ -454,6 +268,8 @@ TEST(cl_test, target_wrapper_image_test) { cl_image2d_slice_pitch, IoDirection::DtoD); std::fill(h_image_cpy.begin(), h_image_cpy.end(), 0); + + // host to device TargetWrapperCL::ImgcpySync(h_image_cpy.data(), d_image_cpy, cl_image2d_width, @@ -461,7 +277,7 @@ TEST(cl_test, target_wrapper_image_test) { cl_image2d_row_pitch, cl_image2d_slice_pitch, IoDirection::DtoH); - for (int i = 0; i < cl_image2d_width * 4 * cl_image2d_height; i++) { + for (int i = 0; i < cl_image2d_elem_size; i++) { EXPECT_NEAR(h_image_cpy[i], 3.14f, 1e-6); } diff --git a/lite/backends/opencl/cl_image_converter.h b/lite/backends/opencl/cl_image_converter.h index 6faa8045576f06d8c636372de644e6b5c164a5f4..962eb8d3ef35bdb603aa4a56181b1124885d5506 100644 --- a/lite/backends/opencl/cl_image_converter.h +++ b/lite/backends/opencl/cl_image_converter.h @@ -103,6 +103,7 @@ class CLImageConverterNormal : public CLImageConverterBase { }; class CLImageConverterNWBlock : public CLImageConverterBase { + public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; void NCHWToImage(float *tensor, float *image, @@ -113,6 +114,7 @@ class CLImageConverterNWBlock : public CLImageConverterBase { const DDim &tensor_dim) override; }; class CLImageConverterDWBlock : public CLImageConverterBase { + public: DDim InitImageDimInfoWith(const DDim &tensor_dim) override; void NCHWToImage(float *tensor, float *image, diff --git a/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl b/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl index c9c16581d67db0c9143e91e13249edfd5901ddb8..532f947dd342b1ee4db69a084111a97ec014237f 100644 --- a/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl +++ b/lite/backends/opencl/cl_kernel/buffer/layout_kernel.cl @@ -61,6 +61,57 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, write_imagef(output_image, output_pos, output); } +// buffer -> image2d_nw +__kernel void buffer_to_image2d_nw(__global CL_DTYPE* in, + __write_only image2d_t output_image, + __private const int out_H, + __private const int out_W, + __private const int out_N, + __private const int Stride0, + __private const int Stride1, + __private const int Stride2) { + const int out_n = get_global_id(0); + const int out_w = get_global_id(1); + const int out_ch = get_global_id(2); + + const int out_c = out_ch / out_H; + const int out_h = out_ch % out_H; + + const int in_c = out_c; // index of c in h direction + + const int in_n0 = out_n * 4 + 0; + const int in_n1 = out_n * 4 + 1; + const int in_n2 = out_n * 4 + 2; + const int in_n3 = out_n * 4 + 3; + + const int in_h = out_h; + const int in_w = out_w; + + int input_pos0 = in_n0 * Stride2 + in_c * 
Stride1 + in_h * Stride0 + in_w; + int input_pos1 = in_n1 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; + int input_pos2 = in_n2 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; + int input_pos3 = in_n3 * Stride2 + in_c * Stride1 + in_h * Stride0 + in_w; + + int2 output_pos; + output_pos.x = out_n * out_W + out_w; + output_pos.y = out_ch; + + CL_DTYPE4 output = (CL_DTYPE4)0.0f; + output.x = convert_float(in[input_pos0]); + if (out_N - 4 * out_n >= 2) { + output.y = convert_float(in[input_pos1]); + } + if (out_N - 4 * out_n >= 3) { + output.z = convert_float(in[input_pos2]); + } + if (out_N - 4 * out_n >= 4) { + output.w = convert_float(in[input_pos3]); + } + write_imagef(output_image, output_pos, output); +} + + + // image2d -> buffer __kernel void image2d_to_buffer(__read_only image2d_t input, __private const int in_width, diff --git a/lite/backends/opencl/cl_kernel/cl_common.h b/lite/backends/opencl/cl_kernel/cl_common.h index 7f901fc994ffd82ccfe99f59614a3422260d0dc5..f193ab82d78fcd21165100658e9a0edefdbd5e0a 100644 --- a/lite/backends/opencl/cl_kernel/cl_common.h +++ b/lite/backends/opencl/cl_kernel/cl_common.h @@ -14,8 +14,17 @@ limitations under the License. */ #pragma once +///////////////////////////////// +// fp16 enabled, MAX_VALUE, MIN_VALUE +///////////////////////////////// #pragma OPENCL EXTENSION cl_khr_fp16 : enable +#define MAX_VALUE FLT_MAX +#define MIN_VALUE -FLT_MAX + +///////////////////////////////// +// CL_DTYPE_float / CL_DTYPE_half +///////////////////////////////// // Data type: pass one of macros on host: [CL_DTYPE_float, CL_DYPE_half] #ifdef CL_DTYPE_float #define CL_DTYPE float @@ -27,24 +36,36 @@ limitations under the License. */ #define CL_DTYPE_CHAR h #endif +///////////////////////////////// +// GET_VEC_TYPE +///////////////////////////////// // Note: macro name replacement need twice parser #define GET_VEC_TYPE(type__, size__) type__##size__ #define VECTORIZED_TYPE(type__, size__) GET_VEC_TYPE(type__, size__) #define CL_DTYPE4 VECTORIZED_TYPE(CL_DTYPE, 4) +///////////////////////////////// +// CONVERT_TYPE_TO +///////////////////////////////// #define _CONVERT_TYPE_TO(value, type) convert_##type(value) #define CONVERT_TYPE_TO(value, type) _CONVERT_TYPE_TO(value, type) +///////////////////////////////// +// WRITE_IMG_TYPE / READ_IMG_TYPE +///////////////////////////////// #define _WRITE_IMG_TYPE(type_char, img, pos, value) \ write_image##type_char(img, pos, value) #define WRITE_IMG_TYPE(type_char, img, pos, value) \ _WRITE_IMG_TYPE(type_char, img, pos, value) -#define _READ_IMG_TYPE(type_char, img, pos, sampler) \ +#define _READ_IMG_TYPE(type_char, img, sampler, pos) \ read_image##type_char(img, sampler, pos) -#define READ_IMG_TYPE(type_char, img, pos, sampler) \ - _READ_IMG_TYPE(type_char, img, pos, sampler) +#define READ_IMG_TYPE(type_char, img, sampler, pos) \ + _READ_IMG_TYPE(type_char, img, sampler, pos) +///////////////////////////////// +// activation / activation_type4 +///////////////////////////////// inline CL_DTYPE activation(CL_DTYPE in #ifdef PRELU , @@ -61,3 +82,20 @@ inline CL_DTYPE activation(CL_DTYPE in #endif return output; } + +inline CL_DTYPE4 activation_type4(CL_DTYPE4 in +#ifdef PRELU + , + CL_DTYPE4 prelu_alpha +#endif + ) { + CL_DTYPE4 output; +#ifdef PRELU + output = select(prelu_alpha * in, in, in >= (CL_DTYPE4)0.0); +#endif + +#ifdef RELU + output = fmax(in, (CL_DTYPE4)0); +#endif + return output; +} diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl 
b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..6fe5596a4cf5cbce5b50c9a3d53be164aad8a0b5 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl @@ -0,0 +1,216 @@ +#include + +__kernel void conv2d_1x1(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter, +#if defined(BIASE_CH) || defined(BIASE_ELE) + __read_only image2d_t bias, +#endif +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int input_c_origin, + __private const int dilation, + __private const int input_width, /* of one block */ + __private const int input_height, /* of one block */ + __private const int output_width, + __private const int output_height, + __private const int old_w) { + CL_DTYPE zero = 0.0f; + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + int out_w0 = out_w; + int out_w1 = out_w + global_size_dim1; + int out_w2 = out_w + global_size_dim1 * 2; + int out_w3 = out_w + global_size_dim1 * 3; + + int outpos_main = mul24(out_c, old_w); + int2 output_pos0 = (int2)(outpos_main + out_w0, out_nh); + int2 output_pos1 = (int2)(outpos_main + out_w1, out_nh); + int2 output_pos2 = (int2)(outpos_main + out_w2, out_nh); + int2 output_pos3 = (int2)(outpos_main + out_w3, out_nh); + + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + + int2 stride_xy = (int2)(stride, stride); + + int2 ouput_pos_in_one_block0 = (int2)(out_w0, out_nh); + int2 in_pos_in_one_block0 = + ouput_pos_in_one_block0 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block1 = (int2)(out_w1, out_nh); + int2 in_pos_in_one_block1 = + ouput_pos_in_one_block1 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block2 = (int2)(out_w2, out_nh); + int2 in_pos_in_one_block2 = + ouput_pos_in_one_block2 * stride_xy + (int2)(offset, offset); + + int2 ouput_pos_in_one_block3 = (int2)(out_w3, out_nh); + int2 in_pos_in_one_block3 = + ouput_pos_in_one_block3 * stride_xy + (int2)(offset, offset); + +#ifdef BIASE_CH + CL_DTYPE4 output0 = + READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0)); + CL_DTYPE4 output1 = output0; + CL_DTYPE4 output2 = output0; + CL_DTYPE4 output3 = output0; +#elif defined(BIASE_ELE) + CL_DTYPE4 output0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos0); + CL_DTYPE4 output1 = output0; + CL_DTYPE4 output2 = output0; + CL_DTYPE4 output3 = output0; + +#else + CL_DTYPE4 output0 = 0.0f; + CL_DTYPE4 output1 = 0.0f; + CL_DTYPE4 output2 = 0.0f; + CL_DTYPE4 output3 = 0.0f; +#endif + + int max_w_bound = input_c * input_width; + int burndary_index = input_c * 4 - input_c_origin; + bool burndary_index_w = + burndary_index == 1 || burndary_index == 2 || burndary_index == 3; + bool burndary_index_z = burndary_index == 2 || burndary_index == 3; + bool burndary_index_y = burndary_index == 3; + + for (int i = 0; i < input_c; ++i) { + // ------------0--------------- + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block0.x, + in_pos_in_one_block0.y); + CL_DTYPE4 input0 = + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); + + CL_DTYPE4 weight0 = + 
READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 0)); + CL_DTYPE4 weight1 = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 1)); + CL_DTYPE4 weight2 = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 2)); + CL_DTYPE4 weight3 = + READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(out_c, i * 4 + 3)); + int bound_gap = max_w_bound - pos_in.x - 1; + + bool outof_bound = bound_gap < input_width && bound_gap >= 0; + input0.w = select(input0.w, zero, outof_bound && burndary_index_w); + input0.z = select(input0.z, zero, outof_bound && burndary_index_z); + input0.y = select(input0.y, zero, outof_bound && burndary_index_y); + + output0 = mad(input0.x, weight0, output0); + output0 = mad(input0.y, weight1, output0); + output0 = mad(input0.z, weight2, output0); + output0 = mad(input0.w, weight3, output0); + // -------------1-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block1.x, + in_pos_in_one_block1.y); + CL_DTYPE4 input1 = + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); + + bound_gap = max_w_bound - pos_in.x - 1; + + outof_bound = bound_gap < input_width && bound_gap >= 0; + input1.w = select(input1.w, zero, outof_bound && burndary_index_w); + input1.z = select(input1.z, zero, outof_bound && burndary_index_z); + input1.y = select(input1.y, zero, outof_bound && burndary_index_y); + + output1 = mad(input1.x, weight0, output1); + output1 = mad(input1.y, weight1, output1); + output1 = mad(input1.z, weight2, output1); + output1 = mad(input1.w, weight3, output1); + + // -------------2-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block2.x, + in_pos_in_one_block2.y); + CL_DTYPE4 input2 = + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); + + bound_gap = max_w_bound - pos_in.x - 1; + + outof_bound = bound_gap < input_width && bound_gap >= 0; + input2.w = select(input2.w, zero, outof_bound && burndary_index_w); + input2.z = select(input2.z, zero, outof_bound && burndary_index_z); + input2.y = select(input2.y, zero, outof_bound && burndary_index_y); + + output2 = mad(input2.x, weight0, output2); + output2 = mad(input2.y, weight1, output2); + output2 = mad(input2.z, weight2, output2); + output2 = mad(input2.w, weight3, output2); + + // -------------3-------------- + pos_in = (int2)(i * input_width + in_pos_in_one_block3.x, + in_pos_in_one_block3.y); + CL_DTYPE4 input3 = + READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, pos_in); + bound_gap = max_w_bound - pos_in.x - 1; + + outof_bound = bound_gap < input_width && bound_gap >= 0; + input3.w = + select(input3.w, + zero, + outof_bound && (burndary_index == 1 || burndary_index == 2 || + burndary_index == 3)); + input3.z = + select(input3.z, + zero, + outof_bound && (burndary_index == 2 || burndary_index == 3)); + input3.y = select(input3.y, zero, outof_bound && burndary_index == 3); + + output3 = mad(input3.x, weight0, output3); + output3 = mad(input3.y, weight1, output3); + output3 = mad(input3.z, weight2, output3); + output3 = mad(input3.w, weight3, output3); + } + +#ifdef BATCH_NORM + output0 = output0 * READ_IMG_TYPE( + CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + + READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); + + output1 = output1 * READ_IMG_TYPE( + CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + + READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0)); + + output2 = output2 * READ_IMG_TYPE( + CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + + READ_IMG_TYPE(CL_DTYPE_CHAR, 
+                          new_biase, sampler, (int2)(out_c, 0));
+
+  output3 = output3 * READ_IMG_TYPE(
+                          CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) +
+            READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
+#endif
+
+#ifdef RELU
+  output0 = activation_type4(output0);
+  output1 = activation_type4(output1);
+  output2 = activation_type4(output2);
+  output3 = activation_type4(output3);
+#endif
+
+  if (out_w0 < old_w) {
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos0, output0);
+  }
+
+  if (out_w1 < old_w) {
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos1, output1);
+  }
+
+  if (out_w2 < old_w) {
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos2, output2);
+  }
+
+  if (out_w3 < old_w) {
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos3, output3);
+  }
+}
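
[Editor's note: each conv2d_1x1 work-item accumulates four output columns spaced global_size_dim1 apart and masks the tail with the `out_wX < old_w` guards, so the host presumably launches a W dimension that is a quarter of the output width. A host-side sketch of that assumed launch geometry, with illustrative names, not part of the patch:

    // Assumed global work size for conv2d_1x1 (illustrative):
    const size_t gws0 = (out_channels + 3) / 4;  // RGBA channel blocks
    const size_t gws1 = (old_w + 3) / 4;         // quarter of the output width
    const size_t gws2 = batch * out_height;      // N * H rows
    // A work-item then covers columns out_w, out_w + gws1,
    // out_w + 2 * gws1, out_w + 3 * gws1.
]
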
diff --git a/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl
new file mode 100755
index 0000000000000000000000000000000000000000..1e3586b7fde8d79fe49327185c623ac613cd080d
--- /dev/null
+++ b/lite/backends/opencl/cl_kernel/image/depthwise_conv2d_kernel.cl
@@ -0,0 +1,322 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+
+#include <cl_common.h>
+
+__kernel void depth_conv2d_3x3(__private const int global_size_dim0,
+                               __private const int global_size_dim1,
+                               __private const int global_size_dim2,
+                               __read_only image2d_t input,
+                               __read_only image2d_t filter,
+#if defined(BIASE_CH) || defined(BIASE_ELE)
+                               __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                               __read_only image2d_t new_scale,
+                               __read_only image2d_t new_biase,
+#endif
+                               __write_only image2d_t output_image,
+                               __private const int stride,
+                               __private const int offset,
+                               __private const int dilation,
+                               __private const int input_c,
+                               __private const int input_width,  /* of one block */
+                               __private const int input_height, /* of one block */
+                               __private const int output_width,
+                               __private const int output_height) {
+
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+
+  const int batch_index = out_nh / output_height;
+
+  const int out_nh_in_one_batch = out_nh % output_height;
+
+  int2 stride_xy = (int2)(stride, stride);
+  int2 ouput_pos_in_one_block = (int2)(out_w, out_nh_in_one_batch);
+
+  int2 in_pos_in_one_block = ouput_pos_in_one_block * stride_xy + (int2)(offset, offset);
+
+#ifdef BIASE_CH
+  CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(out_c, 0));
+#elif defined(BIASE_ELE)
+  CL_DTYPE4 output = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, output_pos);
+#else
+  CL_DTYPE4 output = 0.0f;
+#endif
+
+  const int filter_width = 3;
+  const int filter_height = 3;
+
+  int2 pos_in_input_block = (int2)(out_c * input_width, batch_index * input_height);
+
+  int2 pos_in_filter_block =
+      (int2)(out_c * filter_width, batch_index * filter_height);
+
+  int filter_x = pos_in_filter_block.x;
+  int filter_y = pos_in_filter_block.y;
+
+  CL_DTYPE4 inputs[9];
+
+  inputs[0] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                     (CL_DTYPE4)(0.0f),
+                     (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
+
+  inputs[1] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                     (CL_DTYPE4)(0.0f),
+                     (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
+
+  inputs[2] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y - 1)),
+                     (CL_DTYPE4)(0.0f),
+                     (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y - 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y - 1 >= input_height) << 15));
+
+  inputs[3] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y)),
+                     (CL_DTYPE4)(0.0f),
+                     (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15));
+  /*
+  if (output_pos.x == 112 && output_pos.y == 0) {
+    CL_DTYPE4 input1 = inputs[3];
+    float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
+    printf(" input4 3 - %v4hlf \n", in);
+    printf(" --- %d ---\n", in_pos_in_one_block.x - 1);
+  }
+  */
+
+  inputs[4] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y)),
+                     (CL_DTYPE4)(0.0f),
+                     (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
+
+  inputs[5] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y)),
+                     (CL_DTYPE4)(0.0f),
+                     (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y >= input_height) << 15));
+
+  inputs[6] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x - 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                     (CL_DTYPE4)(0.0f),
+                     (ushort4)((in_pos_in_one_block.x - 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x - 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
+
+  inputs[7] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                     (CL_DTYPE4)(0.0f),
+                     (ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
+
+  inputs[8] = select(READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_input_block.x + in_pos_in_one_block.x + 1, pos_in_input_block.y + in_pos_in_one_block.y + 1)),
+                     (CL_DTYPE4)(0.0f),
+                     (ushort4)((in_pos_in_one_block.x + 1 < 0 || in_pos_in_one_block.y + 1 < 0 || in_pos_in_one_block.x + 1 >= input_width || in_pos_in_one_block.y + 1 >= input_height) << 15));
+
+  CL_DTYPE4 filters[9];
+  filters[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y));
+  filters[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y));
+  filters[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y));
+  filters[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 1));
+  filters[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 1));
+  filters[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 1));
+  filters[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 2));
+  filters[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 2));
+  filters[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 2));
+
+  for (int i = 0; i < 9; i++) {
+    output += inputs[i] * filters[i];
+  }
+#ifdef BATCH_NORM
+  output = output * READ_IMG_TYPE(CL_DTYPE_CHAR, new_scale, sampler, (int2)(out_c, 0)) + READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(out_c, 0));
+#endif
+
+#ifdef RELU
+  output = activation(output);
+#endif
+
+  /*
+
+  if (output_pos.x == 112 && output_pos.y == 0) {
+
+    for (int i = 0; i < 9; ++i) {
+      CL_DTYPE4 input1 = inputs[i];
+      float4 in = (float4)(input1.x, input1.y, input1.z, input1.w);
+      printf(" input4 %d - %v4hlf \n", i, in);
+    }
+
+    float4 out = (float4)(output.x, output.y, output.z, output.w);
+    printf(" depth wise output output4 = %v4hlf \n", out);
+    printf(" pos_in_input_block -x %d \n ", pos_in_input_block.x);
+    printf(" pos_in_input_block -y %d \n ", pos_in_input_block.y);
+    printf(" in_pos_in_one_block - x %d \n", in_pos_in_one_block.x);
+    printf(" in_pos_in_one_block - y %d \n", in_pos_in_one_block.y);
+  }
+
+  */
+
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
+
+}
+
+
+
+__kernel void depth_conv2d_3x3s1(__private const int ou_ch_blk,
+                                 __private const int ou_w_blk,
+                                 __private const int ou_nh,
+                                 __read_only image2d_t input,
+                                 __read_only image2d_t filter,
+#if defined(BIASE_CH) || defined(BIASE_ELE)
+                                 __read_only image2d_t bias,
+#endif
+#ifdef BATCH_NORM
+                                 __read_only image2d_t new_scale,
+                                 __read_only image2d_t new_biase,
+#endif
+                                 __write_only image2d_t output_image,
+                                 __private const int stride,
+                                 __private const int pad,
+                                 __private const int dilation,
+                                 __private const int in_ch,
+                                 __private const int in_w,  /* of one block */
+                                 __private const int in_h,  /* of one block */
+                                 __private const int ou_w,
+                                 __private const int ou_h) {
+
+  const int ou_ch_blk_id = get_global_id(0);
+  const int ou_w_blk_id = get_global_id(1);
+  const int ou_nh_id = get_global_id(2);
+  const int w_blk_size = 2;
+
+  const int batch_id = ou_nh_id / ou_h;
+  int ou_col_id = ou_w_blk_id * w_blk_size;
+  int ou_row_id = ou_nh_id % ou_h;
+  int ou_x = mad24(ou_ch_blk_id, ou_w, ou_col_id);
+
+  // input pos in one block and on batch
+  int col_id = ou_col_id - pad;
+  int row_id = ou_row_id - pad;
+
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+
+#ifdef BIASE_CH
+  CL_DTYPE4 output[2];
+  output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_ch_blk_id, 0));
+  output[1] = output[0];
+#elif defined(BIASE_ELE)
+  CL_DTYPE4 output[2];
+  output[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler,
+                            (int2)(ou_x, ou_nh_id));
+  if (ou_col_id + 1 < ou_w) {
+    output[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2)(ou_x + 1, ou_nh_id));
+  }
+#else
+  CL_DTYPE4 output[2] = {0.0f};
+#endif
+
+  CL_DTYPE4 inputs[12];
+
+  int filter_x = ou_ch_blk_id * 3;
+  int filter_y = 0;
+  CL_DTYPE4 filters[9];
+  filters[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y));
+  filters[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y));
+  filters[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y));
+
+  int in_x = mad24(ou_ch_blk_id, in_w, col_id);
+  int in_y = mad24(batch_id, in_h, row_id);
+
+  int y0 = select(in_y, -1, row_id < 0 || row_id >= in_h);
+  int x0 = select(in_x, -1, col_id < 0 || col_id >= in_w);
+  inputs[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y0));
+  int x1 = select(in_x + 1, -1, col_id + 1 < 0 || col_id + 1 >= in_w);
+  inputs[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y0));
+  int x2 = select(in_x + 2, -1, col_id + 2 < 0 || col_id + 2 >= in_w);
+  inputs[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y0));
+  int x3 = select(in_x + 3, -1, col_id + 3 < 0 || col_id + 3 >= in_w);
+  inputs[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y0));
+
+  output[0] = mad(inputs[0], filters[0], output[0]);
+  output[1] = mad(inputs[1], filters[0], output[1]);
+
+  output[0] = mad(inputs[1], filters[1], output[0]);
+  output[1] = mad(inputs[2], filters[1], output[1]);
+
+  output[0] = mad(inputs[2], filters[2], output[0]);
+  output[1] = mad(inputs[3], filters[2], output[1]);
+
+  filters[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 1));
+  filters[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 1));
+  filters[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 1));
+
+  int y1 = select(in_y + 1, -1, row_id + 1 < 0 || row_id + 1 >= in_h);
+  inputs[4] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y1));
+  inputs[5] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y1));
+  inputs[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y1));
+  inputs[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y1));
+
+  output[0] = mad(inputs[4], filters[3], output[0]);
+  output[1] = mad(inputs[5], filters[3], output[1]);
+
+  output[0] = mad(inputs[5], filters[4], output[0]);
+  output[1] = mad(inputs[6], filters[4], output[1]);
+
+  output[0] = mad(inputs[6], filters[5], output[0]);
+  output[1] = mad(inputs[7], filters[5], output[1]);
+
+  filters[6] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x, filter_y + 2));
+  filters[7] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 1, filter_y + 2));
+  filters[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, filter, sampler, (int2)(filter_x + 2, filter_y + 2));
+
+  int y2 = select(in_y + 2, -1, row_id + 2 < 0 || row_id + 2 >= in_h);
+  inputs[8] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x0, y2));
+  inputs[9] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x1, y2));
+  inputs[10] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x2, y2));
+  inputs[11] = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x3, y2));
+
+  output[0] = mad(inputs[8], filters[6], output[0]);
+  output[1] = mad(inputs[9], filters[6], output[1]);
+
+  output[0] = mad(inputs[9], filters[7], output[0]);
+  output[1] = mad(inputs[10], filters[7], output[1]);
+
+  output[0] = mad(inputs[10], filters[8],
+                  output[0]);
+  output[1] = mad(inputs[11], filters[8], output[1]);
+#ifdef BATCH_NORM
+  CL_DTYPE4 scale = READ_IMG_TYPE(CL_DTYPE_CHAR, new_scale, sampler, (int2)(ou_ch_blk_id, 0));
+  CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, new_biase, sampler, (int2)(ou_ch_blk_id, 0));
+  output[0] = mad(scale, output[0], biase);
+  if (ou_col_id + 1 < ou_w) {
+    output[1] = mad(scale, output[1], biase);
+  }
+#endif
+
+#ifdef RELU
+  output[0] = activation(output[0]);
+  output[1] = activation(output[1]);
+#endif
+
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x, ou_nh_id), output[0]);
+  if (ou_col_id + 1 < ou_w) {
+    WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(ou_x + 1, ou_nh_id), output[1]);
+  }
+
+}
+
diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl
index ecf719ae9316ed14743e872a1c2cde4b254b35ff..a95c6c6897944c9c943f65b72e51a2ced94befa6 100644
--- a/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/elementwise_add_kernel.cl
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <cl_common.h>
+
 __kernel void elementwise_add(__read_only image2d_t input,
                               __read_only image2d_t bias,
                               __write_only image2d_t outputImage) {
   int x = get_global_id(0);
   int y = get_global_id(1);
diff --git a/lite/backends/opencl/cl_kernel/image/pool_kernel.cl b/lite/backends/opencl/cl_kernel/image/pool_kernel.cl
index 0ca3b9141daf671737af8d24cd03e59587e33350..775166261d01dc639cd5af8cee49f7e7fb30cb19 100644
--- a/lite/backends/opencl/cl_kernel/image/pool_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/pool_kernel.cl
@@ -12,15 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 */
-#define MIN_VALUE -FLT_MAX
-
-__kernel void pool_max(
-    __private const int in_height, __private const int in_width,
-    __private const int out_height, __private const int out_width,
-    __private const int pad_top, __private const int pad_left,
-    __private const int stride_h, __private const int stride_w,
-    __private const int ksize_h, __private const int ksize_w,
-    __read_only image2d_t input, __write_only image2d_t output) {
+#include <cl_common.h>
+
+__kernel void pool_max(__read_only image2d_t input,
+                       __write_only image2d_t output,
+                       __private const int in_height,
+                       __private const int in_width,
+                       __private const int out_height,
+                       __private const int out_width,
+                       __private const int ksize_h,
+                       __private const int ksize_w,
+                       __private const int stride_h,
+                       __private const int stride_w,
+                       __private const int pad_top,
+                       __private const int pad_left) {
   const int out_c = get_global_id(0);
   const int out_w = get_global_id(1);
   const int out_nh = get_global_id(2);
@@ -40,25 +45,30 @@ __kernel void pool_max(
   const int pos_in_x = out_c * in_width;
   const int pos_in_y = out_n * in_height;
-  float4 max_value = (float4)(MIN_VALUE);
+  CL_DTYPE4 max_value = (CL_DTYPE4)(MIN_VALUE);
   for (int y = start_h; y < end_h; ++y) {
     for (int x = start_w; x < end_w; ++x) {
-      float4 tmp = read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
+      CL_DTYPE4 tmp = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
       max_value = max(max_value, tmp);
     }
   }
   const int pos_out_x = mad24(out_c, out_width, out_w);
-  write_imagef(output, (int2)(pos_out_x, out_nh), max_value);
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_out_x, out_nh), max_value);
 }
-__kernel void pool_avg(
-    __private const int in_height, __private const int in_width,
-    __private const int out_height, __private const int out_width,
-    __private const int pad_top, __private const int pad_left,
-    __private const int stride_h, __private const int stride_w,
-    __private const int ksize_h, __private const int ksize_w,
-    __read_only image2d_t input, __write_only image2d_t output) {
+__kernel void pool_avg(__read_only image2d_t input,
+                       __write_only image2d_t output,
+                       __private const int in_height,
+                       __private const int in_width,
+                       __private const int out_height,
+                       __private const int out_width,
+                       __private const int ksize_h,
+                       __private const int ksize_w,
+                       __private const int stride_h,
+                       __private const int stride_w,
+                       __private const int pad_top,
+                       __private const int pad_left) {
   const int out_c = get_global_id(0);
   const int out_w = get_global_id(1);
   const int out_nh = get_global_id(2);
@@ -76,15 +86,14 @@ __kernel void pool_avg(
   const int pos_in_x = out_c * in_width;
   const int pos_in_y = out_n * in_height;
-  float4 sum = (float4)(0.0f);
-  int num = 0;
+  CL_DTYPE4 sum = (CL_DTYPE4)(0.0f);
+
   for (int y = start_h; y < end_h; ++y) {
     for (int x = start_w; x < end_w; ++x) {
-      sum += read_imagef(input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
-      num++;
+      sum += READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(pos_in_x + x, pos_in_y + y));
     }
   }
-  float4 avg = sum / num;
+  CL_DTYPE4 avg = sum / (ksize_h * ksize_w);
   const int pos_out_x = mad24(out_c, out_width, out_w);
-  write_imagef(output, (int2)(pos_out_x, out_nh), avg);
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(pos_out_x, out_nh), avg);
 }
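
[Editor's note: the rewritten pool_avg divides by the full window area ksize_h * ksize_w instead of by the number of in-bounds elements, i.e. count-include-pad averaging. A minimal scalar sketch of those semantics, illustrative only and not part of the patch:

    // Padded positions contribute zero to the sum, but the divisor
    // stays ksize_h * ksize_w (count-include-pad averaging).
    float pool_avg_ref(const float* in, int in_h, int in_w, int start_h,
                       int start_w, int ksize_h, int ksize_w) {
      float sum = 0.f;
      for (int y = start_h; y < start_h + ksize_h; ++y)
        for (int x = start_w; x < start_w + ksize_w; ++x)
          if (y >= 0 && y < in_h && x >= 0 && x < in_w) sum += in[y * in_w + x];
      return sum / (ksize_h * ksize_w);
    }
]
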
diff --git a/lite/backends/opencl/cl_kernel/image/relu_kernel.cl b/lite/backends/opencl/cl_kernel/image/relu_kernel.cl
index a99ac79d32bcedb48354d2e179ef6c8c1ff7f997..43a27067c2f2c418d314f9bce95bccbbb51a9be0 100644
--- a/lite/backends/opencl/cl_kernel/image/relu_kernel.cl
+++ b/lite/backends/opencl/cl_kernel/image/relu_kernel.cl
@@ -24,7 +24,7 @@ __kernel void relu(__read_only image2d_t input,
                             CLK_ADDRESS_CLAMP |
                             CLK_FILTER_NEAREST;
 
-  CL_DTYPE4 in = read_imagef(input, sampler, (int2)(x, y));
+  CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
   in = max((CL_DTYPE4)(0.0f), in);
-  write_imagef(output, (int2)(x, y), in);
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
 }
diff --git a/lite/backends/opencl/cl_kernel/image/reshape_kernel.cl b/lite/backends/opencl/cl_kernel/image/reshape_kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..314be875d29d2125f9573d33010ee9d33317ea71
--- /dev/null
+++ b/lite/backends/opencl/cl_kernel/image/reshape_kernel.cl
@@ -0,0 +1,162 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cl_common.h>
+
+__kernel void reshape(__read_only image2d_t input_image,
+                      __write_only image2d_t output_image,
+                      __private const int out_C,
+                      __private const int out_H,
+                      __private const int out_W,
+                      __private const int in_W,
+                      __private const int in_H,
+                      __private const int in_Stride0,
+                      __private const int in_Stride1,
+                      __private const int in_Stride2,
+                      __private const int out_Stride0,
+                      __private const int out_Stride1,
+                      __private const int out_Stride2) {
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+  const int out_n = out_nh / out_H;
+  const int out_h = out_nh % out_H;
+  const int out_c0 = out_c * 4;
+  const int out_c1 = out_c * 4 + 1;
+  const int out_c2 = out_c * 4 + 2;
+  const int out_c3 = out_c * 4 + 3;
+
+  int count0 =
+      out_n * out_Stride2 + out_c0 * out_Stride1 + out_h * out_Stride0 + out_w;
+  int count1 =
+      out_n * out_Stride2 + out_c1 * out_Stride1 + out_h * out_Stride0 + out_w;
+  int count2 =
+      out_n * out_Stride2 + out_c2 * out_Stride1 + out_h * out_Stride0 + out_w;
+  int count3 =
+      out_n * out_Stride2 + out_c3 * out_Stride1 + out_h * out_Stride0 + out_w;
+
+  int in_n0 = count0 / in_Stride2;
+  int in_n1 = count1 / in_Stride2;
+  int in_n2 = count2 / in_Stride2;
+  int in_n3 = count3 / in_Stride2;
+
+  count0 = count0 % in_Stride2;
+  count1 = count1 % in_Stride2;
+  count2 = count2 % in_Stride2;
+  count3 = count3 % in_Stride2;
+
+  int in_c0 = count0 / in_Stride1;
+  int in_c1 = count1 / in_Stride1;
+  int in_c2 = count2 / in_Stride1;
+  int in_c3 = count3 / in_Stride1;
+
+  int in_h0 = (count0 % in_Stride1) / in_Stride0;
+  int in_h1 = (count1 % in_Stride1) / in_Stride0;
+  int in_h2 = (count2 % in_Stride1) / in_Stride0;
+  int in_h3 = (count3 % in_Stride1) / in_Stride0;
+
+  int in_w0 = (count0 % in_Stride1) % in_Stride0;
+  int in_w1 = (count1 % in_Stride1) % in_Stride0;
+  int in_w2 = (count2 % in_Stride1) % in_Stride0;
+  int in_w3 = (count3 % in_Stride1) % in_Stride0;
+
+  int2 input_pos0;
+  int2 input_pos1;
+  int2 input_pos2;
+  int2 input_pos3;
+
+  input_pos0.x = (in_c0 / 4) * in_W + in_w0;
+  input_pos0.y = in_n0 * in_H + in_h0;
+
+  input_pos1.x = (in_c1 / 4) * in_W + in_w1;
+  input_pos1.y = in_n1 * in_H + in_h1;
+
+  input_pos2.x = (in_c2 / 4) * in_W + in_w2;
+  input_pos2.y = in_n2 * in_H + in_h2;
+
+  input_pos3.x = (in_c3 / 4) * in_W + in_w3;
+  input_pos3.y = in_n3 * in_H + in_h3;
+
+  int2 output_pos;
+  output_pos.x = out_c * out_W + out_w;
+  output_pos.y = out_nh;
+
+  const sampler_t sampler =
+      CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+
+  CL_DTYPE4 input0;
+  CL_DTYPE4 input1;
+  CL_DTYPE4 input2;
+  CL_DTYPE4 input3;
+  CL_DTYPE4 output;
+
+  input0 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos0);
+  if (in_c0 % 4 == 0) {
+    output.x = input0.x;
+  } else if (in_c0 % 4 == 1) {
+    output.x = input0.y;
+  } else if (in_c0 % 4 == 2) {
+    output.x = input0.z;
+  } else {
+    output.x = input0.w;
+  }
+  if (out_C - out_c * 4 >= 2) {
+    input1 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos1);
+    if (in_c1 % 4 == 0) {
+      output.y = input1.x;
+    } else if (in_c1 % 4 == 1) {
+      output.y = input1.y;
+    } else if (in_c1 % 4 == 2) {
+      output.y = input1.z;
+    } else {
+      output.y = input1.w;
+    }
+
+  } else {
+    output.y = 0.0f;
+  }
+
+  if (out_C - out_c * 4 >= 3) {
+    input2 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos2);
+
+    if (in_c2 % 4 == 0) {
+      output.z = input2.x;
+    } else if (in_c2 % 4 == 1) {
+      output.z = input2.y;
+    } else if (in_c2 % 4 == 2) {
+      output.z = input2.z;
+    } else {
+      output.z = input2.w;
+    }
+  } else {
+    output.z = 0.0f;
+  }
+
+  if (out_C - out_c * 4 >= 4) {
+    input3 = READ_IMG_TYPE(CL_DTYPE_CHAR, input_image, sampler, input_pos3);
+    if (in_c3 % 4 == 0) {
+      output.w = input3.x;
+    } else if (in_c3 % 4 == 1) {
+      output.w = input3.y;
+    } else if (in_c3 % 4 == 2) {
+      output.w = input3.z;
+    } else {
+      output.w = input3.w;
+    }
+  } else {
+    output.w = 0.0f;
+  }
+
+  WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, output_pos, output);
+}
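
[Editor's note: the reshape kernel relies on the image2d layout convention used throughout these kernels: four consecutive channels of an NCHW tensor are packed into one RGBA texel. A small sketch of the coordinate mapping it computes, as an illustrative helper rather than part of the patch:

    // Maps an NCHW coordinate to its texel position and RGBA lane in the
    // image2d layout assumed by the OpenCL kernels above.
    struct ImagePos { int x, y, lane; };
    ImagePos nchw_to_image(int n, int c, int h, int w, int W, int H) {
      ImagePos p;
      p.x = (c / 4) * W + w;  // column: channel block * width + w
      p.y = n * H + h;        // row: batch * height + h
      p.lane = c % 4;         // which of .x/.y/.z/.w inside the texel
      return p;
    }
]
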
diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc
index c2504ab611e93399c70169f3f123d4a0514c07ad..0c7b2f8575a88082f6d79a5392c4468715a701b9 100644
--- a/lite/backends/opencl/cl_runtime.cc
+++ b/lite/backends/opencl/cl_runtime.cc
@@ -103,6 +103,7 @@ std::unique_ptr<cl::UserEvent> CLRuntime::CreateEvent(
 bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) {
   std::string build_option = options + " -cl-fast-relaxed-math -I " +
                              CLRuntime::Global()->cl_path() + "/cl_kernel";
+  VLOG(4) << "OpenCL build_option: " << build_option;
   status_ = program->build({*device_}, build_option.c_str());
   CL_CHECK_ERROR(status_);
diff --git a/lite/backends/opencl/target_wrapper.cc b/lite/backends/opencl/target_wrapper.cc
index 575f87d0f8d0192345c6ab111db46715a809a976..310567baa539697f6a67b59f6c0e5f29ce46a80e 100644
--- a/lite/backends/opencl/target_wrapper.cc
+++ b/lite/backends/opencl/target_wrapper.cc
@@ -24,6 +24,8 @@ static cl_channel_type GetCLChannelType(const PrecisionType type) {
   switch (type) {
     case PRECISION(kFloat):
       return CL_FLOAT;
+    case PRECISION(kFP16):
+      return CL_HALF_FLOAT;
     case PRECISION(kInt32):
       return CL_SIGNED_INT32;
     case PRECISION(kInt8):
@@ -58,17 +60,18 @@ void TargetWrapperCL::Free(void *ptr) {
 
 template <>
 void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width,
-                                          const size_t cl_image2d_height) {
+                                          const size_t cl_image2d_height,
+                                          void *host_ptr) {
   cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kFloat)));
   cl_int status;
   cl::Image2D *cl_image =
       new cl::Image2D(CLRuntime::Global()->context(),
-                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+                      CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0),
                       img_format,
                       cl_image2d_width,
                       cl_image2d_height,
                       0,
-                      nullptr,
+                      host_ptr,
                       &status);
   if (status != CL_SUCCESS) {
     delete cl_image;
@@ -78,19 +81,20 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width,
   return cl_image;
 }
 
-template <>
-void *TargetWrapperCL::MallocImage<int8_t>(const size_t cl_image2d_width,
-                                           const size_t cl_image2d_height) {
-  cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kInt8)));
+template <>  // use int16_t to represent half float
+void *TargetWrapperCL::MallocImage<int16_t>(const size_t cl_image2d_width,
+                                            const size_t cl_image2d_height,
+                                            void *host_ptr) {
+  cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kFP16)));
   cl_int status;
   cl::Image2D *cl_image =
       new cl::Image2D(CLRuntime::Global()->context(),
-                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+                      CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0),
                       img_format,
                       cl_image2d_width,
                       cl_image2d_height,
                       0,
-                      nullptr,
+                      host_ptr,
                       &status);
   if (status != CL_SUCCESS) {
     delete cl_image;
@@ -102,17 +106,18 @@ void *TargetWrapperCL::MallocImage(const size_t cl_image2d_width,
 
 template <>
 void *TargetWrapperCL::MallocImage<int32_t>(const size_t cl_image2d_width,
-                                            const size_t cl_image2d_height) {
+                                            const size_t cl_image2d_height,
+                                            void *host_ptr) {
   cl::ImageFormat img_format(CL_RGBA, GetCLChannelType(PRECISION(kInt32)));
   cl_int status;
   cl::Image2D *cl_image =
       new cl::Image2D(CLRuntime::Global()->context(),
-                      CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+                      CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0),
                       img_format,
                       cl_image2d_width,
                       cl_image2d_height,
                       0,
-                      nullptr,
+                      host_ptr,
                       &status);
   if (status != CL_SUCCESS) {
     delete cl_image;
diff --git a/lite/backends/opencl/target_wrapper.h b/lite/backends/opencl/target_wrapper.h
index 7753448052e17ac739f730c9fabcaf9533e0045e..c5ff9e900a70fd96ccb461c74fb61e33815a5e81 100644
--- a/lite/backends/opencl/target_wrapper.h
+++ b/lite/backends/opencl/target_wrapper.h
@@ -48,7 +48,8 @@ class TargetWrapper {
   template <typename T>
   static void* MallocImage(const size_t cl_image2d_width,
-                           const size_t cl_image2d_height);
+                           const size_t cl_image2d_height,
+                           void* host_ptr = nullptr);
   static void FreeImage(void* image);
 
   static void* Map(void* buffer, size_t offset, size_t size);
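
[Editor's note: with the new host_ptr parameter, callers can upload initial data at allocation time via CL_MEM_COPY_HOST_PTR instead of allocating with CL_MEM_ALLOC_HOST_PTR and writing afterwards. A hedged caller sketch; the buffer sizing assumes the RGBA texel packing described above:

    // Allocate a float image and copy host data into it in one step.
    std::vector<float> src(width * height * 4, 0.f);  // 4 floats per RGBA texel
    void* image = TargetWrapperCL::MallocImage<float>(width, height, src.data());
    // Passing nullptr (the default) keeps the old allocate-only behavior.
]
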
diff --git a/lite/backends/xpu/CMakeLists.txt b/lite/backends/xpu/CMakeLists.txt
index f911f8e0e7c61481e1d4e309bc0635718be11206..4491fdeaefe9f16265bdee2c07ebb02b86a2b038 100644
--- a/lite/backends/xpu/CMakeLists.txt
+++ b/lite/backends/xpu/CMakeLists.txt
@@ -2,5 +2,4 @@ if(NOT LITE_WITH_XPU)
   return()
 endif()
 
-lite_cc_library(xpu_runtime SRCS runtime.cc DEPS ${xpu_runtime_libs})
-lite_cc_library(xpu_builder SRCS builder.cc DEPS ${xpu_builder_libs} xpu_runtime tensor op scope)
+lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
diff --git a/lite/backends/xpu/device.cc b/lite/backends/xpu/device.cc
new file mode 100644
index 0000000000000000000000000000000000000000..74a5681aa98f2c2d3d4025d91207f24f0733a19e
--- /dev/null
+++ b/lite/backends/xpu/device.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/backends/xpu/device.h"
+#include "lite/utils/cp_logging.h"
+
+namespace paddle {
+namespace lite {
+namespace xpu {
+
+std::unique_ptr<xtcl::network::xRuntimeInstance> Device::Build(
+    xtcl::network::xNetworkBuilder* builder,
+    xtcl::network::xTensorCompiler::ParamNDArrayMap* params,
+    std::vector<xtcl::xExpr*>* outputs) {
+  VLOG(3) << "[XPU] Build model";
+  CHECK(builder != nullptr);
+  CHECK(outputs != nullptr);
+  CHECK_GT(outputs->size(), 0);
+
+  // The XPU compiler builds the graph and fills in all of the constant
+  // params; only one output is supported for now.
+  xtcl::Array<xtcl::xExpr> all_outs;
+  for (size_t i = 0; i < outputs->size(); i++) {
+    all_outs.push_back(*outputs->at(i));
+  }
+  xtcl::xNetwork network =
+      builder->FinalizeNetwork(xtcl::relay::TupleNode::make(all_outs));
+  auto target = xtcl::Target::Create(device_name_);
+  auto compiler = xtcl::network::xTensorCompiler(network, target);
+  compiler.SetParams(*params);  // Set the data of constant tensors
+  compiler.Build();
+  return std::unique_ptr<xtcl::network::xRuntimeInstance>(
+      new xtcl::network::xRuntimeInstance(compiler.CreateRuntimeInstance()));
+}
+
+}  // namespace xpu
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/xpu/device.h b/lite/backends/xpu/device.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf9a8bf76af168a8a73f8f497b793df88f48f96b
--- /dev/null
+++ b/lite/backends/xpu/device.h
@@ -0,0 +1,50 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#include <xtcl/xtcl.h>
+
+namespace paddle {
+namespace lite {
+namespace xpu {
+
+class Device {
+ public:
+  static Device& Global() {
+    static Device x;
+    return x;
+  }
+  Device() {}
+
+  // Build the XPU graph and return an XPU runtime instance that can be
+  // used to run inference.
+  std::unique_ptr<xtcl::network::xRuntimeInstance> Build(
+      xtcl::network::xNetworkBuilder* builder,
+      xtcl::network::xTensorCompiler::ParamNDArrayMap* params,
+      std::vector<xtcl::xExpr*>* outputs);
+
+ private:
+  // Keep reserved fields
+  int device_id_{0};
+  std::string device_name_{"llvm"};
+};
+
+}  // namespace xpu
+}  // namespace lite
+}  // namespace paddle
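
[Editor's note: a sketch of how a subgraph bridge might drive the new Device API; the builder, params, and outputs are assumed to have been populated by op converters (illustrative, not part of the patch):

    // Compile an XPU graph into a runtime instance and keep it for inference.
    xtcl::network::xNetworkBuilder builder;
    xtcl::network::xTensorCompiler::ParamNDArrayMap params;
    std::vector<xtcl::xExpr*> outputs;
    // ... op converters fill builder/params/outputs ...
    auto runtime =
        paddle::lite::xpu::Device::Global().Build(&builder, &params, &outputs);
]
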
diff --git a/lite/backends/xpu/runtime.cc b/lite/backends/xpu/runtime.cc
deleted file mode 100644
index a2c34b95758e8abf81c8294507d0ca60aad7c021..0000000000000000000000000000000000000000
--- a/lite/backends/xpu/runtime.cc
+++ /dev/null
@@ -1,46 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/backends/xpu/runtime.h"
-#include <string>
-#include "lite/utils/cp_logging.h"
-
-namespace paddle {
-namespace lite {
-namespace xpu {
-
-// Extract the model data and recover the XPU model for inference, the function
-// is called by the graph computing kernel when the graph op is executed.
-// Due to the lack of XPU APIs for loading and recovering the XPU model from
-// memory, the key name is obtained from the weight tensor of graph op, to get
-// the runtime object for inference from the global variable 'DeviceInfo'.
-// TODO(hong19860320) Recover the XPU model from the weight tensor of graph op.
-bool LoadModel(const lite::Tensor &model,
-               std::shared_ptr<xtcl::network::xRuntimeInstance> *runtime) {
-  LOG(INFO) << "[XPU] Load Model.";
-  CHECK_GT(model.dims().production(), 0);
-  std::string name(reinterpret_cast<const char *>(model.data()));
-  LOG(INFO) << "[XPU] Model Name: " << name;
-  CHECK(runtime != nullptr);
-  *runtime = DeviceInfo::Global().Find(name);
-  if (*runtime == nullptr) {
-    LOG(WARNING) << "[XPU] Load Model failed!";
-    return false;
-  }
-  return true;
-}
-
-}  // namespace xpu
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/backends/xpu/runtime.h b/lite/backends/xpu/runtime.h
deleted file mode 100644
index 4ff8d75bce6156d51a4988d427058da34460443f..0000000000000000000000000000000000000000
--- a/lite/backends/xpu/runtime.h
+++ /dev/null
@@ -1,69 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <xtcl/xtcl.h>
-#include "lite/core/tensor.h"
-
-namespace paddle {
-namespace lite {
-namespace xpu {
-
-class DeviceInfo {
- public:
-  static DeviceInfo& Global() {
-    static DeviceInfo x;
-    return x;
-  }
-  DeviceInfo() {}
-
-  void Insert(const std::string& name,
-              std::shared_ptr<xtcl::network::xRuntimeInstance> runtime) {
-    if (runtimes_.find(name) != runtimes_.end()) {
-      LOG(WARNING) << "[XPU] Model " << name << " already exists.";
-      return;
-    }
-    runtimes_.emplace(std::make_pair(name, runtime));
-  }
-
-  void Clear() { runtimes_.clear(); }
-
-  std::shared_ptr<xtcl::network::xRuntimeInstance> Find(
-      const std::string& name) const {
-    if (runtimes_.find(name) != runtimes_.end()) {
-      return runtimes_.at(name);
-    } else {
-      return nullptr;
-    }
-  }
-
- private:
-  int device_id_{0};
-  std::string device_name_{"default"};
-  std::unordered_map<std::string,
-                     std::shared_ptr<xtcl::network::xRuntimeInstance>>
-      runtimes_;
-};
-
-bool LoadModel(const lite::Tensor& model,
-               std::shared_ptr<xtcl::network::xRuntimeInstance>* runtime);
-
-}  // namespace xpu
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt
index a93b962a4723b2677defc16fdaf1d0922f1b48fa..34d9deff6a5262c16c2f74301771b73479f3ae30 100644
--- a/lite/core/CMakeLists.txt
+++ b/lite/core/CMakeLists.txt
@@ -33,9 +33,9 @@ lite_cc_library(scope SRCS scope.cc DEPS tensor)
 lite_cc_library(device_info SRCS device_info.cc DEPS tensor)
 
 if (LITE_WITH_ARM)
-lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context gflags NPU_DEPS npu_runtime)
+lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context gflags)
else()
-lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context gflags XPU_DEPS xpu_runtime)
+lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context gflags)
endif()
 
#-------------------------------------------- GET CODE META INFO ------------------------------------------
diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt
index bc77afd81e0859b9492b2068ce681098a9393923..6c0c917a3e6b18f926a5fa768131e36296301432 100644
--- a/lite/core/arena/CMakeLists.txt
+++ b/lite/core/arena/CMakeLists.txt
@@ -5,6 +5,6 @@ endif()
 
 lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
 
-if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_XPU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
-  lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${x86_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
+  lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 endif()
diff --git a/lite/core/arena/framework.cc b/lite/core/arena/framework.cc
index 561a508d20f1db9283a410b8ee35dd851149429c..fe36f1e1ba16ad85c44136b09a0d2e5d3fadf688 100644
--- a/lite/core/arena/framework.cc
+++ b/lite/core/arena/framework.cc
@@ -14,13 +14,38 @@
 
 #include "lite/core/arena/framework.h"
 #include "lite/core/context.h"
+#include "lite/operators/subgraph_op.h"
 
 namespace paddle {
 namespace lite {
 namespace arena {
 
 void TestCase::CreateInstruction() {
-  auto op = LiteOpRegistry::Global().Create(op_desc().Type());
+  std::shared_ptr<OpLite> op = nullptr;
+  if (place_.target == TARGET(kNPU) || place_.target == TARGET(kXPU)) {
+    // Create a new block desc to wrap the original op desc
+    int sub_block_idx = 0;
+    auto sub_block_desc = new cpp::BlockDesc();
+    sub_block_desc->ClearOps();
+    sub_block_desc->ClearVars();
+    auto sub_block_op_desc = sub_block_desc->AddOp();
+    *sub_block_op_desc = *op_desc_;
+    // Add the block desc into the subgraph op which is used to replace the
+    // original op
+    op_desc_.reset(new cpp::OpDesc());
+    op_desc_->SetType("subgraph");
+    op_desc_->SetAttr("sub_block", sub_block_idx);
+    auto in_names = sub_block_op_desc->input_vars();
+    auto out_names = sub_block_op_desc->output_vars();
+    op_desc_->SetInput("Inputs", in_names);
+    op_desc_->SetOutput("Outputs", out_names);
+    op_desc_->SetAttr<std::vector<std::string>>("input_data_names", in_names);
+    op_desc_->SetAttr<std::vector<std::string>>("output_data_names", out_names);
+    op = LiteOpRegistry::Global().Create(op_desc().Type());
+    static_cast<operators::SubgraphOp*>(op.get())->SetSubBlock(sub_block_desc);
+  } else {
+    op = LiteOpRegistry::Global().Create(op_desc().Type());
+  }
   CHECK(op) << "no op for " << op_desc().Type();
   op->Attach(*op_desc_, inst_scope_);
   auto kernels = op->CreateKernels({place_});
@@ -68,6 +93,19 @@ void TestCase::PrepareInputsForInstruction() {
   }
 }
 
+TestCase::~TestCase() {
+  if (op_desc_->Type() == "subgraph") {
+    // Release the sub-block desc of the subgraph op
+    auto subgraph_op = const_cast<operators::SubgraphOp*>(
+        static_cast<const operators::SubgraphOp*>(instruction_->op()));
+    CHECK(subgraph_op);
+    auto sub_block_desc = subgraph_op->GetSubBlock();
+    if (sub_block_desc) {
+      delete sub_block_desc;
+    }
+  }
+}
+
 }  // namespace arena
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/core/arena/framework.h b/lite/core/arena/framework.h
index 412ac0c167b8abe6d196dc25d1bc5b193d02965d..05af21bbdbfd6d00aa0eb3992fa732cf8f2e0fab 100644
--- a/lite/core/arena/framework.h
+++ b/lite/core/arena/framework.h
@@ -21,6 +21,7 @@
 #include <cmath>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 #include "lite/core/op_registry.h"
@@ -42,7 +43,7 @@ class TestCase {
       : place_(place), scope_(new Scope), alias_(alias) {
     ctx_ = ContextScheduler::Global().NewContext(place_.target);
   }
-  virtual ~TestCase() {}
+  virtual ~TestCase();
 
   void Prepare() {
     PrepareScopes();
@@ -77,6 +78,20 @@ class TestCase {
   // kernel registry.
   void CheckKernelConsistWithDefinition() {}
 
+  // Get the real precision of the output for precision checking. When the
+  // declared precision obtained from the kernel is kAny, the real precision
+  // of the output must have been set in the test case.
+  bool GetPrecisionType(const std::string& var_name,
+                        PrecisionType* precision_type) {
+    auto res = precision_type_map_.find(var_name);
+    if (res == precision_type_map_.end()) {
+      return false;
+    } else {
+      *precision_type = precision_type_map_.at(var_name);
+      return true;
+    }
+  }
+
   Scope& scope() { return *scope_; }
 
   Scope* baseline_scope() { return base_scope_; }
@@ -105,6 +120,19 @@ class TestCase {
   // Prepare for the operator.
   virtual void PrepareOpDesc(cpp::OpDesc* op_desc) = 0;
 
+  // Set the real precision of the output for precision checking. When the
+  // declared precision obtained from the kernel is kAny, the test case
+  // should set the real precision of the output.
+  void SetPrecisionType(const std::string& var_name,
+                        const PrecisionType& precision_type) {
+    auto res = precision_type_map_.find(var_name);
+    if (res == precision_type_map_.end()) {
+      precision_type_map_.insert({var_name, precision_type});
+    } else {
+      precision_type_map_.at(var_name) = precision_type;
+    }
+  }
+
  public:
   const Instruction& instruction() { return *instruction_; }
 
@@ -148,6 +176,7 @@ class TestCase {
   Scope* base_scope_{};
   std::unique_ptr<cpp::OpDesc> op_desc_;
   std::unique_ptr<Instruction> instruction_;
+  std::unordered_map<std::string, PrecisionType> precision_type_map_;
 };
 
 class Arena {
@@ -159,13 +188,17 @@ class Arena {
     tester_->Prepare();
   }
 
-  bool TestPrecision() {
+  bool TestPrecision(const std::vector<std::string>& exclude_outs = {}) {
     tester_->RunBaseline(tester_->baseline_scope());
     tester_->RunInstruction();
 
     bool success = true;
    for (auto& out : tester_->op_desc().OutputArgumentNames()) {
       for (auto& var : tester_->op_desc().Output(out)) {
+        if (std::find(exclude_outs.begin(), exclude_outs.end(), var) !=
+            exclude_outs.end()) {
+          continue;
+        }
         success = success && CompareTensor(out, var);
       }
     }
@@ -189,8 +222,11 @@ class Arena {
     // get tensor type.
     const Type* type =
         tester_->instruction().kernel()->GetOutputDeclType(arg_name);
-
-    switch (type->precision()) {
+    auto precision_type = type->precision();
+    if (precision_type == PRECISION(kAny)) {
+      CHECK(tester_->GetPrecisionType(var_name, &precision_type));
+    }
+    switch (precision_type) {
       case PRECISION(kFloat):
         return tester_->CheckPrecision<float>(var_name, abs_error_);
       case PRECISION(kInt8):
        return tester_->CheckPrecision<int8_t>(var_name, abs_error_);
       case PRECISION(kInt32):
         return tester_->CheckPrecision<int32_t>(var_name, abs_error_);
       case PRECISION(kBool):
         return tester_->CheckPrecision<bool>(var_name, abs_error_);
-
       default:
         LOG(FATAL) << "not support type " << PrecisionToStr(type->precision());
         return false;
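
[Editor's note: a sketch of how a test might use the new precision hooks when a kernel declares PRECISION(kAny) for its output; the variable names are illustrative:

    // In a TestCase subclass: tell the arena the real output precision so
    // CompareTensor() can pick the right CheckPrecision<T> instantiation.
    SetPrecisionType("out_var", PRECISION(kFloat));
    // In the test body: skip outputs that should not be compared.
    arena.TestPrecision({"intermediate_out"});  // exclude_outs
]
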
diff --git a/lite/core/context.h b/lite/core/context.h
index eb25e7e1d980de9e8f633591fc1320f2a7cd476d..2830bca5c1b1e3dce151e498dd502e6636e54950 100644
--- a/lite/core/context.h
+++ b/lite/core/context.h
@@ -25,12 +25,6 @@
 #include "lite/backends/opencl/cl_context.h"
 #include "lite/backends/opencl/cl_runtime.h"
 #endif
-#ifdef LITE_WITH_NPU
-#include "lite/backends/npu/runtime.h"
-#endif
-#ifdef LITE_WITH_XPU
-#include "lite/backends/xpu/runtime.h"
-#endif
 
 #include <map>
 #include <memory>
@@ -93,7 +87,7 @@ template <>
 class Context<TargetType::kXPU> {
  public:
   Context() {}
-  explicit Context(const NPUContext& ctx);
+  explicit Context(const XPUContext& ctx);
   // NOTE: InitOnce should only be used by ContextScheduler
   void InitOnce() {}
   void CopySharedTo(XPUContext* ctx) {}
diff --git a/lite/core/memory.h b/lite/core/memory.h
index cb4ac044e7af6994e5e404f379eeb12290e34778..18b9958911a6173c088b415369555235d63d184d 100644
--- a/lite/core/memory.h
+++ b/lite/core/memory.h
@@ -100,13 +100,14 @@ class Buffer {
   template <typename T>
   void ResetLazyImage2D(TargetType target,
                         const size_t img_w,
-                        const size_t img_h) {
+                        const size_t img_h,
+                        void* host_ptr = nullptr) {
     size_t size =
         sizeof(T) * img_w * img_h * 4;  // 4 for RGBA, un-used for opencl Image2D
     if (target != target_ || cl_image2d_width_ < img_w ||
         cl_image2d_height_ < img_h) {
       Free();
-      data_ = TargetWrapperCL::MallocImage<T>(img_w, img_h);
+      data_ = TargetWrapperCL::MallocImage<T>(img_w, img_h, host_ptr);
       target_ = target;
       space_ = size;  // un-used for opencl Image2D
       cl_image2d_width_ = img_w;
diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt
index a44b8348716449519486d37f6784e31ecc39f554..810ff0f875168da1c4411471b7ea3ea6617a9b4f 100644
--- a/lite/core/mir/CMakeLists.txt
+++ b/lite/core/mir/CMakeLists.txt
@@ -16,10 +16,12 @@ lite_cc_library(mir_passes
     fusion/interpolate_fuse_pass.cc
     fusion/conv_elementwise_fuse_pass.cc
     fusion/conv_activation_fuse_pass.cc
+    fusion/var_conv_2d_activation_fuse_pass.cc
     fusion/conv_bn_fuse_pass.cc
     fusion/elementwise_add_activation_fuse_pass.cc
     fusion/quant_dequant_fuse_pass.cc
     elimination/identity_scale_eliminate_pass.cc
+    elimination/elementwise_mul_constant_eliminate_pass.cc
     static_kernel_pick_pass.cc
     variable_place_inference_pass.cc
     type_target_cast_pass.cc
@@ -32,7 +34,7 @@ lite_cc_library(mir_passes
     demo_pass.cc
     runtime_context_assign_pass.cc
     memory_optimize_pass.cc
-    DEPS mir_pass types context ${mir_fusers} ${subgraph_passes})
+    DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs})
 
# lite_cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS
#mir_ssa_graph scope op
diff --git a/lite/core/mir/elimination/elementwise_mul_constant_eliminate_pass.cc b/lite/core/mir/elimination/elementwise_mul_constant_eliminate_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..863c01ef0646794b5cbe54d7a81a8f26dbf164ae
--- /dev/null
+++ b/lite/core/mir/elimination/elementwise_mul_constant_eliminate_pass.cc
@@ -0,0 +1,88 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/mir/pass.h"
+#include "lite/core/mir/pass_registry.h"
+#include "lite/core/mir/pattern_matcher_high_api.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+namespace {
+
+class ElementwiseMulConstantEliminator : public FuseBase {
+ public:
+  void BuildPattern() override {
+    auto* pre_op = OpNode("preop");    // the previous op's output needs update
+    auto* post_op = OpNode("postop");  // the post op's output needs update
+    // TODO(Superjomn) check has only one output
+    auto* x =
+        VarNode("x")->assert_is_op_input("elementwise_mul", "X")->AsOutput();
+    auto* y = VarNode("Y")->assert_is_op_input("elementwise_mul", "Y");
+
+    // create op nodes
+    auto* mul = OpNode("mul", "elementwise_mul")
+                    ->assert_is_op("elementwise_mul")
+                    ->AsIntermediate();
+
+    auto* fill_constant = OpNode("fill_constant", "fill_constant")
+                              ->assert_is_op("fill_constant")
+                              ->assert_op_attr("value", 1.)
+                              ->AsIntermediate();
+    // create output node
+    auto* mul_out =
+        VarNode("output")->assert_is_op_output("elementwise_mul", "Out");
+    // create topology.
+    std::vector<PMNode*> add_inputs{x, y};
+    *pre_op >> *x;
+    *fill_constant >> *y;
+    add_inputs >> *mul >> *mul_out;
+    *mul_out >> *post_op;
+
+    // The multiply-by-constant-1 will be eliminated, and a new output-updated
+    // op will be inserted.
+    mul_out->AsIntermediate();  // mul_out is pre_op's output, needs update
+    y->AsIntermediate();        // needs update
+  }
+
+ private:
+  void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override {
+    auto& post_op = matched.at("postop")->AsStmt();
+    auto op_info = *post_op.op_info();
+
+    op_info.UpdateAllInputs(matched.at("output")->AsArg().name,
+                            matched.at("x")->AsArg().name);
+    post_op.ResetOp(op_info, graph->valid_places());
+
+    IR_NODE_LINK_TO(matched.at("x"), matched.at("postop"));
+  }
+};
+
+}  // namespace
+
+class ElementwiseMulConstantEliminatePass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override {
+    ElementwiseMulConstantEliminator eliminator;
+    eliminator(graph.get());
+  }
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(elementwise_mul_constant_eliminate_pass,
+                  paddle::lite::mir::ElementwiseMulConstantEliminatePass)
+    .BindTargets({TARGET(kAny)});
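
[Editor's note: the pattern this pass removes, roughly, with illustrative variable names:

    // before: y = fill_constant(value=1.0); out = elementwise_mul(x, y); post_op(out)
    // after:  post_op(x)  // the multiply-by-one and its fill_constant are gone
]
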
diff --git a/lite/core/mir/elimination/identity_scale_eliminate_pass.cc b/lite/core/mir/elimination/identity_scale_eliminate_pass.cc
index acea48c742522d5b6b5f1f3b570fcbfe0c4be08d..345361047bbbad68cdd0b298a163214cbfe114fc 100644
--- a/lite/core/mir/elimination/identity_scale_eliminate_pass.cc
+++ b/lite/core/mir/elimination/identity_scale_eliminate_pass.cc
@@ -25,7 +25,8 @@ namespace {
 class Eliminator : public FuseBase {
  public:
   void BuildPattern() override {
-    auto* pre_op = OpNode("preop");  // the previous op's output need update
+    // the previous op's output needs update
+    auto* pre_op = OpNode("preop")->assert_is_not_op_type("conditional_block");
     // TODO(Superjomn) check has only one output
     auto* x = VarNode("x")->assert_is_op_input("scale", "X");
     auto* scale_op = OpNode("scale", "scale")
diff --git a/lite/core/mir/fusion/CMakeLists.txt b/lite/core/mir/fusion/CMakeLists.txt
index 5ac52837551f0b78d67dfe1733fe354ee2cf7f01..8699470955b663fc2562074e99529def72836794 100644
--- a/lite/core/mir/fusion/CMakeLists.txt
+++ b/lite/core/mir/fusion/CMakeLists.txt
@@ -10,6 +10,9 @@ lite_cc_library(fuse_conv_elementwise
 lite_cc_library(fuse_conv_activation
     SRCS conv_activation_fuser.cc
     DEPS pattern_matcher_high_api)
+lite_cc_library(fuse_var_conv_activation
+    SRCS var_conv_2d_activation_fuser.cc
+    DEPS pattern_matcher_high_api)
 lite_cc_library(fuse_conv_bn
     SRCS conv_bn_fuser.cc
     DEPS pattern_matcher_high_api)
@@ -31,6 +34,7 @@ set(mir_fusers
     fuse_shuffle_channel
     fuse_conv_elementwise
     fuse_conv_activation
+    fuse_var_conv_activation
     fuse_conv_bn
     fuse_quant_dequant
     fuse_elementwise_add_activation
diff --git a/lite/core/mir/fusion/conv_activation_fuse_pass.cc b/lite/core/mir/fusion/conv_activation_fuse_pass.cc
index 0d11b47db6a7f767f8cd032877d8647b0872b8d4..c5ce74e30e34b5878a534010b6cf8b86f91a1118 100644
--- a/lite/core/mir/fusion/conv_activation_fuse_pass.cc
+++ b/lite/core/mir/fusion/conv_activation_fuse_pass.cc
@@ -30,7 +30,7 @@ void ConvActivationFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
       break;
     }
   }
-  for (auto conv_type : {"conv2d", "depthwise_conv2d"}) {
+  for (auto conv_type : {"conv2d", "depthwise_conv2d", "conv2d_transpose"}) {
     for (auto act_type : act_types) {
       for (auto has_bias : {true, false}) {
         fusion::ConvActivationFuser fuser(conv_type, act_type, has_bias);
diff --git a/lite/core/mir/fusion/conv_bn_fuse_pass.cc b/lite/core/mir/fusion/conv_bn_fuse_pass.cc
index 5ab5f8c0a4797e51cce656de43883a68d4931e9b..4725ca74855d72674b922478acd1f6f3a3b59798 100644
--- a/lite/core/mir/fusion/conv_bn_fuse_pass.cc
+++ b/lite/core/mir/fusion/conv_bn_fuse_pass.cc
@@ -26,7 +26,8 @@ namespace mir {
 void ConvBNFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
   // initialize fuser params
   std::vector<bool> conv_has_bias_cases{true, false};
-  std::vector<std::string> conv_type_cases{"conv2d", "depthwise_conv2d"};
+  std::vector<std::string> conv_type_cases{
+      "conv2d", "depthwise_conv2d", "conv2d_transpose"};
 
   // start fuse using params
   for (auto conv_has_bias : conv_has_bias_cases) {
diff --git a/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.cc b/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0ce2248cbc23d8887a22f94c14b2507fb0cacbed
--- /dev/null
+++ b/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.cc
@@ -0,0 +1,40 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.h"
+#include <memory>
+#include <vector>
+#include "lite/core/mir/fusion/var_conv_2d_activation_fuser.h"
+#include "lite/core/mir/pass_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+void VarConv2dActivationFusePass::Apply(
+    const std::unique_ptr<SSAGraph>& graph) {
+  std::vector<std::string> act_types{"relu"};
+  for (auto act_type : act_types) {
+    fusion::VarConvActivationFuser fuser(act_type, "var_conv_2d");
+    fuser(graph.get());
+  }
+}
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(lite_var_conv_2d_activation_fuse_pass,
+                  paddle::lite::mir::VarConv2dActivationFusePass)
+    .BindTargets({TARGET(kCUDA)});
diff --git a/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.h b/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..7616aadef340d3e4d6bc11534dd839c91fe9ed1d
--- /dev/null
+++ b/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include "lite/core/mir/pass.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+class VarConv2dActivationFusePass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/mir/fusion/var_conv_2d_activation_fuser.cc b/lite/core/mir/fusion/var_conv_2d_activation_fuser.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eabd97ae4513b84c9c002aa1587d45cce6b22e21
--- /dev/null
+++ b/lite/core/mir/fusion/var_conv_2d_activation_fuser.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/mir/fusion/var_conv_2d_activation_fuser.h"
+#include <memory>
+#include <vector>
+
+namespace paddle {
+namespace lite {
+namespace mir {
+namespace fusion {
+
+void VarConvActivationFuser::BuildPattern() {
+  // create nodes.
+  auto* input = VarNode("X")->assert_is_op_input(conv_type_, "X")->AsInput();
+  auto* filter = VarNode("W")->assert_is_op_input(conv_type_, "W")->AsInput();
+
+  auto* conv2d = OpNode("var_conv_2d", conv_type_)->AsIntermediate();
+
+  auto* act = OpNode("act", act_type_)->AsIntermediate();
+
+  auto* conv2d_out = VarNode("conv2d_out")
+                         ->assert_is_op_output(conv_type_, "Out")
+                         ->assert_is_op_input(act_type_, "X")
+                         ->AsIntermediate();
+  auto* conv2d_out_1 = VarNode("conv2d_out_1")
+                           ->assert_is_op_output(conv_type_, "Col")
+                           ->AsIntermediate();
+
+  auto* out =
+      VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput();
+
+  // create topology.
+ std::vector conv2d_inputs{filter, input}; + conv2d_inputs >> *conv2d >> *conv2d_out >> *act >> *out; + *conv2d >> *conv2d_out_1; +} + +void VarConvActivationFuser::InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched) { + auto op_desc = GenOpDesc(matched); + auto conv_op = LiteOpRegistry::Global().Create(conv_type_); + auto conv_old = matched.at("var_conv_2d")->stmt()->op(); + auto* scope = conv_old->scope(); + auto& valid_places = conv_old->valid_places(); + conv_op->Attach(op_desc, scope); + + auto* new_op_node = graph->GraphCreateInstructNode(conv_op, valid_places); + + IR_NODE_LINK_TO(matched.at("X"), new_op_node); + IR_NODE_LINK_TO(matched.at("W"), new_op_node); + IR_NODE_LINK_TO(new_op_node, matched.at("output")); +} + +cpp::OpDesc VarConvActivationFuser::GenOpDesc(const key2nodes_t& matched) { + cpp::OpDesc op_desc = *matched.at("var_conv_2d")->stmt()->op_info(); + op_desc.SetOutput("Out", {matched.at("output")->arg()->name}); + cpp::OpDesc act_op_desc = *matched.at("act")->stmt()->op_info(); + + if (act_type_ == "relu") { + op_desc.SetAttr("fuse_relu", true); + } + return op_desc; +} + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/fusion/var_conv_2d_activation_fuser.h b/lite/core/mir/fusion/var_conv_2d_activation_fuser.h new file mode 100644 index 0000000000000000000000000000000000000000..68bc89f7d13d38dc07814f3296a25bfd7dea0248 --- /dev/null +++ b/lite/core/mir/fusion/var_conv_2d_activation_fuser.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include "lite/core/mir/pattern_matcher_high_api.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class VarConvActivationFuser : public FuseBase { + public: + explicit VarConvActivationFuser(const std::string& act_type, + const std::string& conv_type) + : act_type_(act_type), conv_type_(conv_type) {} + + void BuildPattern() override; + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; + + private: + cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; + std::string act_type_; + std::string conv_type_; +}; + +} // namespace fusion +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/graph_visualize_pass.cc b/lite/core/mir/graph_visualize_pass.cc index 76ea9555c29a245aa9f20b158f0706557940bef8..3a27360f94d7d828e1c19214d621f1dfe4e048ca 100644 --- a/lite/core/mir/graph_visualize_pass.cc +++ b/lite/core/mir/graph_visualize_pass.cc @@ -36,15 +36,6 @@ std::string Visualize(mir::SSAGraph* graph) { int id = 0; std::set exists_args; - std::map graph_col; // Different colors of subgraphs - graph_col.insert({{1, "red"}, - {2, "green"}, - {3, "cyan"}, - {4, "bisque3"}, - {5, "coral"}, - {6, "darkseagreen1"}, - {7, "goldenrod1"}, - {8, "darkorchid"}}); for (auto& node : graph->mutable_nodes()) { std::string key; if (node.IsArg()) { @@ -52,24 +43,12 @@ std::string Visualize(mir::SSAGraph* graph) { } else { key = string_format("%s%d", node.AsStmt().op_type().c_str(), id++); } - if (node.IsStmt()) { - auto& stmt = node.AsStmt(); - auto sub_id = stmt.subgraph_id(); - auto it = graph_col.find(sub_id); - if (sub_id > 0 && it != graph_col.end()) { - dot.AddNode(key, - {Dot::Attr("shape", "box"), - Dot::Attr("style", "filled"), - Dot::Attr("color", "black"), - Dot::Attr("fillcolor", it->second)}); - } else { - dot.AddNode(key, - {Dot::Attr("shape", "box"), - Dot::Attr("style", "filled"), - Dot::Attr("color", "black"), - Dot::Attr("fillcolor", "yellow")}); - } + dot.AddNode(key, + {Dot::Attr("shape", "box"), + Dot::Attr("style", "filled"), + Dot::Attr("color", "black"), + Dot::Attr("fillcolor", "yellow")}); for (auto& x : node.inlinks) { auto name = x->AsArg().name; if (!exists_args.count(name)) { diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc index 4f41ba4a601ae763e6fa48c0a98de238252ea7c2..dbf32da2348c6aa39eb4f9d9c65b404e31fb3145 100644 --- a/lite/core/mir/memory_optimize_pass.cc +++ b/lite/core/mir/memory_optimize_pass.cc @@ -50,7 +50,7 @@ void MemoryOptimizePass::CollectLifeCycleByDevice( "lod_reset", "concat", "yolo_box", - "graph_op", + "subgraph", "feed", "fetch"}; for (auto* tmp : node->inlinks) { diff --git a/lite/core/mir/node.h b/lite/core/mir/node.h index 60fa1fb1ebe49e1be38a7d84cb82545389ea4aac..e2c8a68bde6ee18506de73a7531716695b3d54f1 100644 --- a/lite/core/mir/node.h +++ b/lite/core/mir/node.h @@ -64,9 +64,6 @@ class Node { return valid_kernels_; } - void ClearSubgraphID() { subgraph_id_ = -1 /* note: not 0 */; } - void SetSubgraphID(int id) { subgraph_id_ = id; } - int subgraph_id() const { return subgraph_id_; } void SetOp(const std::shared_ptr& op) { op_ = op; } const std::shared_ptr op() const { return op_; } @@ -82,11 +79,6 @@ class Node { // Description. 
std::string desc; - - protected: - // -1 means not in subgraph, 0 means supported but not one id, id started - // from 1 - int subgraph_id_{-1}; }; struct Arg { diff --git a/lite/core/mir/pattern_matcher.cc b/lite/core/mir/pattern_matcher.cc index 8e0fc55be2389244ae065b4c2809bbdd74be370c..b625919cbfb6d26ecbbd1bad36772aff86bee087 100644 --- a/lite/core/mir/pattern_matcher.cc +++ b/lite/core/mir/pattern_matcher.cc @@ -377,6 +377,19 @@ PMNode *PMNode::assert_is_op(const std::string &op_type) { return this; } +PMNode *PMNode::assert_is_not_op_type(const std::string &op_type) { + asserts_.emplace_back([op_type](const Node *x) { + if (x && x->IsStmt()) { + auto *op_info = x->stmt()->op_info(); + if (op_info->Type() == op_type) { + return false; + } + } + return true; + }); + return this; +} + PMNode *PMNode::assert_is_var() { asserts_.emplace_back([](const Node *x) { return x && x->IsArg(); }); return this; diff --git a/lite/core/mir/pattern_matcher.h b/lite/core/mir/pattern_matcher.h index 47a0a30b5667ddc97b3783ab9edbab04281528a4..90c4359c6d3ade98cf60b5c23411e2026cdeccc9 100644 --- a/lite/core/mir/pattern_matcher.h +++ b/lite/core/mir/pattern_matcher.h @@ -123,6 +123,7 @@ struct PMNode { // Assertions, helper functions to simplify the pattern definition. PMNode* assert_is_op(); PMNode* assert_is_op(const std::string& op_type); + PMNode* assert_is_not_op_type(const std::string& op_type); PMNode* assert_is_var(); PMNode* assert_var_not_persistable(); PMNode* assert_is_persistable_var(); diff --git a/lite/core/mir/ssa_graph.cc b/lite/core/mir/ssa_graph.cc index 8f22022789046900c3c09cfb122c914968d8d87f..2b5b65ce5903ede41137311c585c0e87eaaa0e9d 100644 --- a/lite/core/mir/ssa_graph.cc +++ b/lite/core/mir/ssa_graph.cc @@ -123,6 +123,9 @@ void SSAGraph::Build(const Program &program, return true; }; + std::unordered_map var_types = + program.var_data_type(); + std::unordered_map arg_update_node_map_; for (auto &op : program.ops()) { VLOG(3) << op->op_info()->Type(); @@ -137,6 +140,10 @@ void SSAGraph::Build(const Program &program, arg_node->AsArg(name, node_storage_.size() - 1); arg_update_node_map_[name] = arg_node; } + if (var_types.count(name) && !arg_node->arg()->type) { + arg_node->arg()->type = LiteType::GetTensorTy( + TARGET(kUnk), var_types[name], DATALAYOUT(kUnk)); + } if (is_weights(name)) arg_node->AsArg().is_weight = true; CHECK(arg_node->IsRoleSet()); DirectedLink(arg_node, op_node); @@ -146,6 +153,10 @@ void SSAGraph::Build(const Program &program, auto *arg_node = &node_storage_.back(); arg_node->AsArg(name, node_storage_.size() - 1); arg_update_node_map_[name] = arg_node; + if (var_types.count(name) && !arg_node->arg()->type) { + arg_node->arg()->type = LiteType::GetTensorTy( + TARGET(kUnk), var_types[name], DATALAYOUT(kUnk)); + } if (is_weights(name)) arg_node->AsArg().is_weight = true; CHECK(arg_node->IsRoleSet()); diff --git a/lite/core/mir/static_kernel_pick_pass.cc b/lite/core/mir/static_kernel_pick_pass.cc index c49e4497099c5f04a39bf91e70ca8f48900e7ba7..1cc8942d611db389a44cbf6a244775a5b666b587 100644 --- a/lite/core/mir/static_kernel_pick_pass.cc +++ b/lite/core/mir/static_kernel_pick_pass.cc @@ -14,7 +14,10 @@ #include "lite/core/mir/static_kernel_pick_pass.h" #include +#include #include +#include +#include #include #include #include "lite/core/mir/graph_visualize_pass.h" @@ -43,13 +46,33 @@ void StaticKernelPickPass::Apply(const std::unique_ptr& graph) { if (!node.IsStmt()) continue; auto& instruct = node.AsStmt(); + std::unordered_map in_types; + std::unordered_map out_types; + 
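+    // Gather the precisions recorded on the linked var nodes (filled in
+    // earlier from the program's var descs, when available), keyed by var
+    // name, so KernelGrade below can reward kernels whose declared input
+    // and output precisions match the model.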
for (std::list::iterator i = node.inlinks.begin(); + i != node.inlinks.end(); + ++i) { + if ((*i)->arg()->type) + in_types[(*i)->arg()->name] = (*i)->arg()->type->precision(); + } + for (std::list::iterator i = node.outlinks.begin(); + i != node.outlinks.end(); + ++i) { + if ((*i)->arg()->type) + out_types[(*i)->arg()->name] = (*i)->arg()->type->precision(); + } // Get candidate kernels std::vector>> scored; CHECK(!instruct.kernels().empty()) << "No kernels found for " << instruct.op_type(); VLOG(4) << "instruct.kernels().size():" << instruct.kernels().size(); for (auto&& kernel : instruct.kernels()) { - float score = KernelGrade(instruct, *kernel, graph->valid_places()); + float score = KernelGrade(instruct, + *kernel, + graph->valid_places(), + in_types, + out_types, + instruct.op_info()->input_names(), + instruct.op_info()->output_names()); VLOG(4) << "kernel->summary():" << kernel->summary() << " score:" << score; scored.emplace_back(score, std::move(kernel)); @@ -99,7 +122,13 @@ void StaticKernelPickPass::Apply(const std::unique_ptr& graph) { instruct.ResetOp(update_desc, graph->valid_places()); scored.clear(); for (auto&& kernel : instruct.kernels()) { - float score = KernelGrade(instruct, *kernel, graph->valid_places()); + float score = KernelGrade(instruct, + *kernel, + graph->valid_places(), + in_types, + out_types, + instruct.op_info()->input_names(), + instruct.op_info()->output_names()); scored.emplace_back(score, std::move(kernel)); } std::sort(scored.begin(), scored.end(), KernelScoreCmp); diff --git a/lite/core/mir/static_kernel_pick_pass.h b/lite/core/mir/static_kernel_pick_pass.h index cd54e2654c22b98cbacc9a73bef7770a029c0b30..f655b298bf2d800f4adf142ad14b8ac05ca00482 100644 --- a/lite/core/mir/static_kernel_pick_pass.h +++ b/lite/core/mir/static_kernel_pick_pass.h @@ -16,6 +16,8 @@ #include #include +#include +#include #include #include "lite/core/mir/pass.h" #include "lite/core/types.h" @@ -48,9 +50,14 @@ class StaticKernelPickPass : public mir::StmtPass { private: // Score the kernel. - size_t KernelGrade(const lite::mir::Node::Stmt& instruct, - const lite::KernelBase& kernel, - const std::vector& places) { + size_t KernelGrade( + const lite::mir::Node::Stmt& instruct, + const lite::KernelBase& kernel, + const std::vector& places, + const std::unordered_map& in_types, + const std::unordered_map& out_types, + const std::vector& in_names, + const std::vector& out_names) { CHECK_GT(places.size(), 0) << "valid_places is empty."; float final_score{-1.}; Place winner_place{places[0]}; @@ -100,6 +107,37 @@ class StaticKernelPickPass : public mir::StmtPass { core::KernelPickFactor::Factor::DataLayoutFirst); } VLOG(4) << "[score s3]:" << score; + + // add new rules for precision: When the input types are consistent with + // kernel's input types and the output types are consistent with kernel's + // output types. Select the kernel of the precision. Note that this + // strategy is not compatible with quantization, so skip quantization op. 
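+      // e.g. for a float32 model, a kernel whose declared precision is
+      // roughly kFloat on every matched argument gets its score doubled
+      // below, while a kernel declared as kInt8 keeps its base score.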
+ if (!instruct.op_info()->HasAttr("enable_int8")) { + bool type_match = true; + for (size_t i = 0; i < in_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetInputArgname(in_names[i], &tmp)); + if (in_types.count(in_names[i]) && + in_types.at(in_names[i]) != + kernel.GetInputDeclType(tmp)->precision()) { + type_match = false; + } + } + for (size_t i = 0; i < out_names.size(); ++i) { + std::string tmp; + CHECK(instruct.op_info()->GetOutputArgname(out_names[i], &tmp)); + if (out_types.count(out_names[i]) && + out_types.at(out_names[i]) != + kernel.GetOutputDeclType(tmp)->precision()) { + type_match = false; + } + } + if (type_match) { + score *= 2; + } + VLOG(4) << "[score s4]:" << score; + } + if (weight * score > final_score) { final_score = weight * score; winner_place = place; diff --git a/lite/core/mir/subgraph/CMakeLists.txt b/lite/core/mir/subgraph/CMakeLists.txt index 95b5fe5ae13e03940bda8d83fcfc252b4ca490ab..1ac4ab346f15edf9e039d3143c0a301d49a1c0b4 100644 --- a/lite/core/mir/subgraph/CMakeLists.txt +++ b/lite/core/mir/subgraph/CMakeLists.txt @@ -1,50 +1,30 @@ - +lite_cc_library(subgraph_detector + SRCS subgraph_detector.cc + DEPS mir_pass types subgraph_op) lite_cc_library(subgraph_pass - SRCS subgraph_program_pass.cc - DEPS mir_pass types ${mir_fusers}) -lite_cc_test(test_subgraph_pass SRCS subgraph_program_pass_test.cc - DEPS subgraph_pass mir_passes gflags model_parser cxx_api - ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 SERIAL) + SRCS subgraph_pass.cc + DEPS mir_pass types context ${mir_fusers} subgraph_detector) if (WITH_TESTING) - add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v1_tar_gz) - add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v2_relu_tar_gz) - set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") - set_target_properties(test_subgraph_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}") -endif() - -set(subgraph_passes subgraph_pass) - -if(LITE_WITH_NPU) - lite_cc_library(npu_pass SRCS generate_npu_program_pass.cc - DEPS mir_pass types context ${mir_fusers} ${npu_bridges} graph_op subgraph_pass) - list(APPEND subgraph_passes npu_pass) - lite_cc_test(test_npu_pass SRCS generate_npu_program_pass_test.cc - DEPS npu_pass mir_passes paddle_api_full paddle_api_light gflags - ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 - --optimized_model=${LITE_MODEL_DIR}/lite_npu_model_opt SERIAL) - if (WITH_TESTING) - add_dependencies(test_npu_pass extern_lite_download_mobilenet_v1_tar_gz) - add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v2_relu_tar_gz) + lite_cc_test(test_subgraph_detector + SRCS subgraph_detector_test.cc + DEPS subgraph_detector mir_passes gflags model_parser cxx_api + ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 SERIAL) + add_dependencies(test_subgraph_detector + extern_lite_download_mobilenet_v1_tar_gz + extern_lite_download_mobilenet_v2_relu_tar_gz) set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") - set_target_properties(test_npu_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}") - endif() -endif() - -if(LITE_WITH_XPU) - lite_cc_library(xpu_pass SRCS generate_xpu_program_pass.cc - DEPS mir_pass types context ${mir_fusers} ${xpu_bridges} ${xpu_builder_libs} graph_op subgraph_pass) - list(APPEND subgraph_passes xpu_pass) - lite_cc_test(test_xpu_pass SRCS generate_xpu_program_pass_test.cc - DEPS xpu_pass mir_passes paddle_api_full gflags - ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 - --optimized_model=${LITE_MODEL_DIR}/lite_npu_model_opt 
SERIAL) - if (WITH_TESTING) - add_dependencies(test_xpu_pass extern_lite_download_mobilenet_v1_tar_gz) - add_dependencies(test_subgraph_pass extern_lite_download_mobilenet_v2_relu_tar_gz) + set_target_properties(test_subgraph_detector PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + lite_cc_test(test_subgraph_pass + SRCS subgraph_pass_test.cc + DEPS mir_passes paddle_api_full paddle_api_light gflags + ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 + --optimized_model_dir=${LITE_MODEL_DIR}/lite_model_opt SERIAL) + add_dependencies(test_subgraph_pass + extern_lite_download_mobilenet_v1_tar_gz + extern_lite_download_mobilenet_v2_relu_tar_gz) set(LINK_FLAGS "-Wl,--version-script ${PADDLE_SOURCE_DIR}/lite/core/lite.map") - set_target_properties(test_xpu_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}") - endif() + set_target_properties(test_subgraph_pass PROPERTIES LINK_FLAGS "${LINK_FLAGS}") endif() -set(subgraph_passes ${subgraph_passes} CACHE INTERNAL "subgraph_passes") -message(STATUS "----> subgraph_passes: ${subgraph_passes}") +set(mir_subgraphs subgraph_pass CACHE INTERNAL "mir_subgraphs") +message(STATUS "----> mir_subgraphs: ${mir_subgraphs}") diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.cc b/lite/core/mir/subgraph/generate_npu_program_pass.cc deleted file mode 100644 index 65c29aa68f1c8c5f5702ca97d27f9579edc7a951..0000000000000000000000000000000000000000 --- a/lite/core/mir/subgraph/generate_npu_program_pass.cc +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/core/mir/subgraph/generate_npu_program_pass.h" -#include -#include -#include -#include -#include -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/mir/pass_registry.h" -#include "lite/core/mir/pattern_matcher.h" - -#include "lite/backends/npu/builder.h" -#include "lite/kernels/npu/bridges/paddle_use_npu_bridges.h" -#include "lite/kernels/npu/bridges/registry.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace subgraph { - -std::shared_ptr GenerateNPUProgramPass::CvtVarNode( - lite::mir::Node* var_node, const Scope* scope) { - CHECK(var_node->IsArg()); - const auto& arg = var_node->AsArg(); - VLOG(4) << "[NPU] Convert var node " << arg.name; - - auto* var = scope->FindVar(arg.name); - CHECK(var); - auto* tensor = var->GetMutable(); - CHECK(tensor); - auto dims = tensor->dims(); - if (arg.is_weight) { - auto wgt = std::make_shared(arg.name); - LOG(INFO) << "[NPU] Convert const var node " << arg.name; - VLOG(4) << dims; - wgt->set_attr_value(lite::npu::CvtTensor(tensor)); - return wgt; - } else { - CHECK_EQ(dims.size(), 4); - LOG(INFO) << "[NPU] Convert data var node " << arg.name; - LOG(INFO) << dims; - // TODO(xxx): support more types and dims size - ge::TensorDesc desc(ge::Shape(dims.Vectorize()), - ge::Format::FORMAT_NCHW, - ge::DataType::DT_FLOAT); - - // auto size = desc.GetShape().GetShapeSize(); - // ge::TensorUtils::SetSize(desc, size*sizeof(float)); - // ge::TensorUtils::SetRealDimCnt(desc, 4); - auto data = std::make_shared(arg.name); - data->update_input_desc_x(desc); - return data; - } - return nullptr; -} - -void GenerateNPUProgramPass::CvtAllOpNodes( - const std::vector& nodes2cvt, - lite::kernels::npu::bridges::node_map_type* converted_vars) { - const auto& bridges = lite::kernels::npu::bridges::Factory::Instance(); - const auto& cvtfunc_map = bridges.AllFunctions(); - // return record all converted vars - // op node's inputs must be found in converted_vars - for (auto& node : nodes2cvt) { - lite::kernels::npu::bridges::node_map_type node_inputs; - auto& stmt = node->AsStmt(); - for (auto& var_node : node->inlinks) { - auto& arg = var_node->AsArg(); - // weight should be handled in the converter, so skip here - if (arg.is_weight) { - continue; - } - auto var_name = arg.name; - if (!converted_vars->count(var_name)) { - converted_vars->insert( - std::make_pair(var_name, CvtVarNode(var_node, stmt.op()->scope()))); - } - node_inputs.insert(*converted_vars->find(var_name)); - } - auto node_outputs = cvtfunc_map.at(stmt.op_type())(stmt.op(), node_inputs); - converted_vars->insert(node_outputs.begin(), node_outputs.end()); - } -} - -std::string GenerateNPUProgramPass::BuildNPUGraph( - const std::unordered_set& op_nodes, - const std::unordered_set& in_data_vars, - const std::unordered_set& out_data_vars, - int sub_id) { - auto ordered_nodes = GetTopologicalOrder(op_nodes); - lite::kernels::npu::bridges::node_map_type converted_vars; - CvtAllOpNodes(ordered_nodes, &converted_vars); - - std::vector in_var_names; - std::vector out_var_names; - std::vector inputs; - std::vector outputs; - for (auto i : in_data_vars) { - auto argname = i->AsArg().name; - in_var_names.push_back(argname); - inputs.push_back(*converted_vars.at(argname)); - } - for (auto i : out_data_vars) { - auto argname = i->AsArg().name; - out_var_names.push_back(argname); - outputs.push_back(*converted_vars.at(argname)); - } - - std::string weight_var_name = "graph" + std::to_string(sub_id) + "_weights"; - auto any_op = (*op_nodes.begin())->AsStmt().op(); - auto 
weight = any_op->scope()->Var(weight_var_name)->GetMutable(); - weight->set_persistable(true); - weight->set_precision(PRECISION(kInt8)); - // Compiling IR graph to NPU model and store mode data into weight tensor with - // persistable=true, Sothat the model parser can recognize it and save it to - // param files - if (!lite::npu::BuildModel(inputs, outputs, weight)) { - LOG(FATAL) << "[NPU] Build NPU graph failed (subgraph=" << sub_id << ")"; - } else { - LOG(INFO) << "[NPU] Build NPU graph success (subgraph=" << sub_id << ")"; - } - return weight_var_name; -} - -void GenerateNPUProgramPass::GenNPUSubgraph( - const std::unique_ptr& graph, - const std::unordered_set& op_nodes, - int sub_id) { - std::unordered_set in_data_vars; - std::unordered_set in_wgt_vars; - std::unordered_set out_data_vars; - std::unordered_set out_unused_vars; - FindInputOutputVars( - op_nodes, &in_data_vars, &in_wgt_vars, &out_data_vars, &out_unused_vars); - - auto weight_var_name = - BuildNPUGraph(op_nodes, in_data_vars, out_data_vars, sub_id); - - auto any_op = (*op_nodes.begin())->AsStmt().op(); - InsertNewNode(graph, - weight_var_name, - any_op->scope(), - any_op->valid_places(), - in_data_vars, - in_wgt_vars, - out_data_vars, - out_unused_vars); - - auto nodes2rm = GetNode2rm( - op_nodes, {in_data_vars, in_wgt_vars, out_data_vars, out_unused_vars}); - - GraphSafeRemoveNodes(graph.get(), nodes2rm); -} - -void GenerateNPUProgramPass::Apply(const std::unique_ptr& graph) { - LOG(INFO) << "[NPU] Before NPU Pass \n" << Visualize(graph.get()); - const auto& bridges = lite::kernels::npu::bridges::Factory::Instance(); - const auto& op_map = bridges.AllFunctions(); - std::vector supported_op_types; - for (auto& i : op_map) { - LOG(INFO) << "[NPU] Supported type: " << i.first; - supported_op_types.push_back(i.first); - } - - int num_subgraph = FuseSubgraph(graph, supported_op_types); - InferOnce(graph); - auto op_nodes_all = ClassifySubgraph(graph); - CHECK_EQ(op_nodes_all.size(), num_subgraph); - int id = 1; - for (auto& op_nodes : op_nodes_all) { - LOG(INFO) << "[NPU] Converting Subgraph " << id; - GenNPUSubgraph(graph, op_nodes.second, id); - LOG(INFO) << "[NPU] After NPU Pass Subgraph " << id << "\n" - << Visualize(graph.get()); - id++; - } -} -} // namespace subgraph -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(generate_npu_program_pass, - paddle::lite::mir::subgraph::GenerateNPUProgramPass) - .BindTargets({TARGET(kNPU)}); diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.h b/lite/core/mir/subgraph/generate_npu_program_pass.h deleted file mode 100644 index 5b1a98c6ed0e10f4fae8832b9ba3c5f98f3d9ed9..0000000000000000000000000000000000000000 --- a/lite/core/mir/subgraph/generate_npu_program_pass.h +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include "lite/backends/npu/builder.h" -#include "lite/core/mir/pass.h" -#include "lite/core/mir/subgraph/subgraph_program_pass.h" -#include "lite/kernels/npu/bridges/registry.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace subgraph { - -class GenerateNPUProgramPass : public SubgraphProgramPass { - public: - using key2nodes_t = std::map; - - void Apply(const std::unique_ptr& graph) override; - - protected: - // nodes2cvt: op nodes to convert - // return cvted_vars: converted var nodes - void CvtAllOpNodes(const std::vector& nodes2cvt, - lite::kernels::npu::bridges::node_map_type* cvted_vars); - - std::shared_ptr CvtVarNode(lite::mir::Node* var_node, - const Scope* scope); - - std::string BuildNPUGraph(const std::unordered_set& op_nodes, - const std::unordered_set& in_data_vars, - const std::unordered_set& out_data_vars, - int sub_id); - - void GenNPUSubgraph(const std::unique_ptr& graph, - const std::unordered_set& op_nodes, - int sub_id); -}; - -} // namespace subgraph -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/subgraph/generate_xpu_program_pass.cc b/lite/core/mir/subgraph/generate_xpu_program_pass.cc deleted file mode 100644 index 4340cb4ee3cccad32db9bc333b5856386812c62a..0000000000000000000000000000000000000000 --- a/lite/core/mir/subgraph/generate_xpu_program_pass.cc +++ /dev/null @@ -1,185 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/core/mir/subgraph/generate_xpu_program_pass.h" -#include -#include -#include -#include -#include -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/mir/pass_registry.h" -#include "lite/core/mir/pattern_matcher.h" - -#include "lite/backends/xpu/builder.h" -#include "lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h" -#include "lite/kernels/xpu/bridges/registry.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace subgraph { - -std::shared_ptr GenerateXPUProgramPass::CvtVarNode( - lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx, - lite::mir::Node* var_node, - const Scope* scope) { - CHECK(var_node->IsArg()); - const auto& arg = var_node->AsArg(); - auto var_name = arg.name; - VLOG(4) << "[XPU] Convert var node " << var_name; - - auto* var = scope->FindVar(var_name); - CHECK(var); - auto* tensor = var->GetMutable(); - CHECK(tensor); - auto dims = tensor->dims(); - auto cvted_var_node = - std::make_shared(graph_ctx->builder->CreateTensor( - var_name, lite::xpu::CvtShape(dims), ::xtcl::Float(32))); - if (arg.is_weight) { - auto cvted_var_tensor = lite::xpu::CvtTensor(tensor); - graph_ctx->params->emplace(std::make_pair(var_name, *cvted_var_tensor)); - } - return cvted_var_node; -} - -void GenerateXPUProgramPass::CvtAllOpNodes( - const std::vector& op_nodes, - lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx, - lite::kernels::xpu::bridges::node_map_type* cvted_var_nodes) { - const auto& bridges = lite::kernels::xpu::bridges::Factory::Instance(); - const auto& supported_lists = bridges.AllFunctions(); - // return record all converted vars - // op node's inputs must be found in converted_vars - for (auto& node : op_nodes) { - lite::kernels::xpu::bridges::node_map_type input_nodes; - auto& stmt = node->AsStmt(); - for (auto& var_node : node->inlinks) { - auto& arg = var_node->AsArg(); - // weight should be handled in the converter, so skip here - if (arg.is_weight) { - continue; - } - auto var_name = arg.name; - if (!cvted_var_nodes->count(var_name)) { - cvted_var_nodes->insert(std::make_pair( - var_name, CvtVarNode(graph_ctx, var_node, stmt.op()->scope()))); - } - input_nodes.insert(*cvted_var_nodes->find(var_name)); - } - auto output_nodes = - supported_lists.at(stmt.op_type())(stmt.op(), graph_ctx, input_nodes); - cvted_var_nodes->insert(output_nodes.begin(), output_nodes.end()); - } -} - -std::string GenerateXPUProgramPass::BuildXPUGraph( - const std::unordered_set& op_nodes, - const std::unordered_set& in_data_vars, - const std::unordered_set& out_data_vars, - int sub_id) { - auto ordered_op_nodes = GetTopologicalOrder(op_nodes); - lite::kernels::xpu::bridges::graph_ctx_type graph_ctx; - graph_ctx.builder = std::make_shared(); - graph_ctx.params = - std::make_shared(); - lite::kernels::xpu::bridges::node_map_type cvted_var_nodes; - CvtAllOpNodes(ordered_op_nodes, &graph_ctx, &cvted_var_nodes); - - std::string weight_var_name = "graph" + std::to_string(sub_id) + "_weights"; - auto any_op = (*op_nodes.begin())->AsStmt().op(); - auto weight = any_op->scope()->Var(weight_var_name)->GetMutable(); - weight->set_persistable(true); - weight->set_precision(PRECISION(kInt8)); - // Compiling graph to XPU model and store mode data into weight tensor with - // persistable=true, Sothat the model parser can recognize it and save it to - // param files - std::vector> ordered_cvted_var_nodes; - for (auto out_data_var : out_data_vars) { - auto var_name = out_data_var->AsArg().name; - ordered_cvted_var_nodes.push_back(cvted_var_nodes[var_name]); 
- } - if (!lite::xpu::BuildModel(graph_ctx.builder, - graph_ctx.params, - &ordered_cvted_var_nodes, - weight)) { - LOG(FATAL) << "[XPU] Build XPU graph failed (subgraph=" << sub_id << ")"; - } else { - LOG(INFO) << "[XPU] Build XPU graph success (subgraph=" << sub_id << ")"; - } - return weight_var_name; -} - -void GenerateXPUProgramPass::GenXPUSubgraph( - const std::unique_ptr& graph, - const std::unordered_set& op_nodes, - int sub_id) { - std::unordered_set in_data_vars; - std::unordered_set in_wgt_vars; - std::unordered_set out_data_vars; - std::unordered_set out_unused_vars; - FindInputOutputVars( - op_nodes, &in_data_vars, &in_wgt_vars, &out_data_vars, &out_unused_vars); - - auto weight_var_name = - BuildXPUGraph(op_nodes, in_data_vars, out_data_vars, sub_id); - - auto any_op = (*op_nodes.begin())->AsStmt().op(); - InsertNewNode(graph, - weight_var_name, - any_op->scope(), - any_op->valid_places(), - in_data_vars, - in_wgt_vars, - out_data_vars, - out_unused_vars); - - auto nodes2rm = GetNode2rm( - op_nodes, {in_data_vars, in_wgt_vars, out_data_vars, out_unused_vars}); - - GraphSafeRemoveNodes(graph.get(), nodes2rm); -} - -void GenerateXPUProgramPass::Apply(const std::unique_ptr& graph) { - LOG(INFO) << "[XPU] Before XPU Pass \n" << Visualize(graph.get()); - const auto& bridges = lite::kernels::xpu::bridges::Factory::Instance(); - const auto& op_map = bridges.AllFunctions(); - std::vector supported_op_types; - for (auto& i : op_map) { - LOG(INFO) << "[XPU] Supported type: " << i.first; - supported_op_types.push_back(i.first); - } - - int num_subgraph = FuseSubgraph(graph, supported_op_types); - InferOnce(graph); - auto op_nodes_all = ClassifySubgraph(graph); - CHECK_EQ(op_nodes_all.size(), num_subgraph); - int id = 1; - for (auto& op_nodes : op_nodes_all) { - LOG(INFO) << "[XPU] Converting Subgraph " << id; - GenXPUSubgraph(graph, op_nodes.second, id); - LOG(INFO) << "[XPU] After XPU Pass Subgraph " << id << "\n" - << Visualize(graph.get()); - id++; - } -} -} // namespace subgraph -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(generate_xpu_program_pass, - paddle::lite::mir::subgraph::GenerateXPUProgramPass) - .BindTargets({TARGET(kXPU)}); diff --git a/lite/core/mir/subgraph/generate_xpu_program_pass.h b/lite/core/mir/subgraph/generate_xpu_program_pass.h deleted file mode 100644 index 777642cfb6c61671a8aeb119c70664297573d9a7..0000000000000000000000000000000000000000 --- a/lite/core/mir/subgraph/generate_xpu_program_pass.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include "lite/backends/xpu/builder.h" -#include "lite/core/mir/pass.h" -#include "lite/core/mir/subgraph/subgraph_program_pass.h" -#include "lite/kernels/xpu/bridges/registry.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace subgraph { - -class GenerateXPUProgramPass : public SubgraphProgramPass { - public: - using key2nodes_t = std::map; - - void Apply(const std::unique_ptr& graph) override; - - protected: - // nodes2cvt: op nodes to convert - // return cvted_vars: converted var nodes - void CvtAllOpNodes( - const std::vector& op_nodes, - lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx, - lite::kernels::xpu::bridges::node_map_type* cvted_var_nodes); - - std::shared_ptr CvtVarNode( - lite::kernels::xpu::bridges::graph_ctx_type* graph_ctx, - lite::mir::Node* var_node, - const Scope* scope); - - std::string BuildXPUGraph(const std::unordered_set& op_nodes, - const std::unordered_set& in_data_vars, - const std::unordered_set& out_data_vars, - int sub_id); - - void GenXPUSubgraph(const std::unique_ptr& graph, - const std::unordered_set& op_nodes, - int sub_id); -}; - -} // namespace subgraph -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/subgraph/generate_xpu_program_pass_test.cc b/lite/core/mir/subgraph/generate_xpu_program_pass_test.cc deleted file mode 100644 index 728ecbc6b77666accd432b1ad82a03860588ab40..0000000000000000000000000000000000000000 --- a/lite/core/mir/subgraph/generate_xpu_program_pass_test.cc +++ /dev/null @@ -1,172 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include "lite/api/paddle_api.h" -#include "lite/api/paddle_use_kernels.h" -#include "lite/api/paddle_use_ops.h" -#include "lite/api/paddle_use_passes.h" -#include "lite/api/test_helper.h" -#include "lite/utils/cp_logging.h" - -DEFINE_string(model_file, "", "model file path of combined protobuf model"); -DEFINE_string(params_file, "", "params file path of combined protobuf model"); -DEFINE_string(optimized_model_dir, "", "path of optimized naive buffer model"); -DEFINE_string(input_tensor_shape, "1,3,224,224", "shapes of input tensors"); -DEFINE_int32(output_tensor_num, 1, "number of output tensors"); - -namespace paddle { -namespace lite { - -std::vector> ParseShape(std::string txt) { - std::vector> shape; - while (!txt.empty()) { - size_t idx = txt.find_first_of(":"); - std::string dims = txt.substr(0, idx); - std::vector s; - while (!dims.empty()) { - size_t idx = dims.find_first_of(","); - int d = atoi(dims.substr(0, idx).c_str()); - VLOG(3) << d; - s.push_back(d); - if (idx == std::string::npos) { - break; - } else { - dims = dims.substr(idx + 1); - } - } - shape.push_back(s); - if (idx == std::string::npos) { - break; - } else { - txt = txt.substr(idx + 1); - } - } - return shape; -} - -int64_t ShapeProduction(std::vector shape) { - int64_t s = 1; - for (int64_t dim : shape) { - s *= dim; - } - return s; -} - -void FillInputTensor( - const std::shared_ptr& predictor, - const std::vector>& input_tensor_shape, - const float value) { - for (int i = 0; i < input_tensor_shape.size(); i++) { - auto input_tensor = predictor->GetInput(i); - input_tensor->Resize(input_tensor_shape[i]); - auto input_tensor_data = input_tensor->mutable_data(); - auto input_tensor_size = ShapeProduction(input_tensor->shape()); - for (int j = 0; j < input_tensor_size; j++) { - input_tensor_data[j] = value; - } - } -} - -void CompareOutputTensor( - const std::shared_ptr& tar_predictor, - const std::shared_ptr& ref_predictor, - const int output_tensor_num) { - for (int i = 0; i < output_tensor_num; i++) { - auto tar_output_tensor = tar_predictor->GetOutput(i); - auto ref_output_tensor = ref_predictor->GetOutput(i); - auto tar_output_tensor_data = tar_output_tensor->data(); - auto ref_output_tensor_data = ref_output_tensor->data(); - auto tar_output_tensor_size = ShapeProduction(tar_output_tensor->shape()); - auto ref_output_tensor_size = ShapeProduction(ref_output_tensor->shape()); - EXPECT_EQ(tar_output_tensor_size, ref_output_tensor_size); - for (size_t j = 0; j < ref_output_tensor_size; j++) { - auto diff = - std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]) / - (std::fabs(ref_output_tensor_data[j]) + 1e-6); - VLOG(3) << diff; - EXPECT_LT(diff, 0.1); - } - } -} - -std::shared_ptr TestModel( - const std::string& model_dir, - const std::string& model_file, - const std::string& params_file, - const std::vector& valid_places, - const std::vector>& input_tensor_shape, - const std::string& optimized_model_dir) { - // generate optimized model - lite_api::CxxConfig cxx_config; - cxx_config.set_model_dir(model_dir); - cxx_config.set_model_file(model_file); - cxx_config.set_param_file(params_file); - cxx_config.set_valid_places(valid_places); - auto predictor = lite_api::CreatePaddlePredictor(cxx_config); - FillInputTensor(predictor, input_tensor_shape, -1); - predictor->SaveOptimizedModel(optimized_model_dir, - lite_api::LiteModelType::kNaiveBuffer); -#if 0 // TODO(hong19860320) supports light api for XPU - // load optimized model - lite_api::MobileConfig mobile_config; - 
mobile_config.set_model_dir(optimized_model_dir); - mobile_config.set_power_mode(lite_api::PowerMode::LITE_POWER_HIGH); - mobile_config.set_threads(1); - predictor = lite_api::CreatePaddlePredictor(mobile_config); - FillInputTensor(predictor, input_tensor_shape, 1); -#endif - // run optimized model - for (int i = 0; i < FLAGS_warmup; i++) { - predictor->Run(); - } - for (int i = 0; i < FLAGS_repeats; i++) { - auto start = GetCurrentUS(); - predictor->Run(); - LOG(INFO) << i << ", " << GetCurrentUS() - start << "us"; - } - return predictor; -} - -TEST(XPUSubgraph, compare) { - // parsing input tensor shape, supported formats: "1,3,224,224" - // "1,3,224,224:1,80" - std::vector> input_tensor_shape = - ParseShape(FLAGS_input_tensor_shape); - // generate and run optimized CPU model - LOG(INFO) << " ================ CPU ================== "; - auto cpu_predictor = - TestModel(FLAGS_model_dir, - FLAGS_model_file, - FLAGS_params_file, - {lite_api::Place{TARGET(kX86), PRECISION(kFloat)}}, - input_tensor_shape, - FLAGS_optimized_model_dir + "/CPU"); - // generate and run optimized XPU model - LOG(INFO) << " ================ XPU ================== "; - auto xpu_predictor = - TestModel(FLAGS_model_dir, - FLAGS_model_file, - FLAGS_params_file, - {lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}, - lite_api::Place{TARGET(kX86), PRECISION(kFloat)}}, - input_tensor_shape, - FLAGS_optimized_model_dir + "/XPU"); - // verify results - CompareOutputTensor(xpu_predictor, cpu_predictor, FLAGS_output_tensor_num); -} - -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc new file mode 100644 index 0000000000000000000000000000000000000000..6d48b053a1a4140252d35e85d2351644d3c216e9 --- /dev/null +++ b/lite/core/mir/subgraph/subgraph_detector.cc @@ -0,0 +1,551 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/core/mir/subgraph/subgraph_detector.h" +#include +#include +#include +#include +#include +#include "lite/core/mir/dot.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/pattern_matcher.h" +#include "lite/operators/subgraph_op.h" + +namespace paddle { +namespace lite { +namespace mir { + +using inference::analysis::Dot; + +std::string SubgraphVisualizer::operator()() { + inference::analysis::Dot dot; + const std::vector subgraph_colors{ + "red", "green", "cyan", "bisque3", + "coral", "darkseagreen1", "goldenrod1", "darkorchid", + "antiquewhite", "aquamarine", "azure", "bisque4", + "blue2", "brown1", "burlywood1", "cadetblue1", + "chartreuse1", "chocolate1", "coral1", "cornsilk", + "crimson", "cyan4", "darkgoldenrod4", "darkolivegreen2", + "darkorange2", "darkorchid2", "darkseagreen3", "darkslategray", + "deeppink2", "deepskyblue2", "dodgerblue", "firebrick", + "floralwhite", "gold1", "skyblue3", "indianred", + "indigo", "lavenderblush2", "lightblue1", "lightsalmon3", + "khaki1", "ivory4", "sandybrown", "olivedrab2", + "turquoise4", "snow3", "sienna4", "salmon2", + }; + std::unordered_map subgraph_indices; + for (int i = 0; i < subgraphs_.size(); i++) { + for (int j = 0; j < subgraphs_[i].size(); j++) { + subgraph_indices[subgraphs_[i][j]] = i; + } + } + std::unordered_map exists_ops; + std::set exists_args; + for (auto &node : graph_->StmtTopologicalOrder()) { + if (!node->IsStmt()) { + continue; + } + auto op_type = node->AsStmt().op_type(); + if (!exists_ops.count(op_type)) { + exists_ops[op_type] = 0; + } else { + exists_ops[op_type]++; + } + auto op_name = op_type + std::to_string(exists_ops[op_type]); + std::string op_color = "white"; + if (subgraph_indices.count(node)) { + auto subgraph_idx = subgraph_indices[node]; + op_name += "_subgraph_" + std::to_string(subgraph_idx); + op_color = subgraph_colors[subgraph_idx % subgraph_colors.size()]; + } + dot.AddNode(op_name, + {Dot::Attr("shape", "box"), + Dot::Attr("style", "filled"), + Dot::Attr("color", "black"), + Dot::Attr("fillcolor", op_color)}); + for (auto &in_node : node->inlinks) { + auto arg_name = in_node->AsArg().name; + if (!exists_args.count(arg_name)) { + dot.AddNode(arg_name, {}); + exists_args.insert(arg_name); + } + dot.AddEdge(arg_name, op_name, {}); + } + for (auto &out_node : node->outlinks) { + auto arg_name = out_node->AsArg().name; + if (!exists_args.count(arg_name)) { + dot.AddNode(arg_name, {}); + exists_args.insert(arg_name); + } + dot.AddEdge(op_name, arg_name, {}); + } + } + + auto res = dot.Build(); + std::cout << "subgraphs: " << subgraphs_.size() << "\n" << res << std::endl; + return res; +} + +// Find the ancestor node +SubgraphDetector::node_dat_t * +SubgraphDetector::node_dat_t::UnionFindAncestor() { + node_dat_t *ancestor = this; + while (ancestor->union_find_parent != ancestor) { + ancestor = ancestor->union_find_parent; + } + return ancestor; +} + +// Merge the two adjacent nodes into one node. +// Suppose we have two adjacent nodes src and dst. +// We will perform the following operations: +// 1. add all inputs(except src) of dst to src inlinks. +// 2. add all outputs of dst to src outlinks. +// 3. change all the dst's inputs and outputs +// corresponding inlinks and outlinks to src node. +// 4. delete all dst's inlinks and outlinks. +void SubgraphDetector::node_dat_t::UnionFindCombine(node_dat_t *candidate) { + // Make this two node share the same ancestor. 
+  union_find_parent = UnionFindAncestor();
+  node_dat_t *candidate_ancestor = candidate->UnionFindAncestor();
+  candidate_ancestor->union_find_parent = union_find_parent;
+  candidate->union_find_parent = union_find_parent;
+
+  // Obtain the input and output nodes for the combined one
+  std::unordered_set<node_dat_t *> inputs(inlinks.begin(), inlinks.end());
+  std::unordered_set<node_dat_t *> outputs(candidate->outlinks.begin(),
+                                           candidate->outlinks.end());
+  for (auto *out_node : outlinks) {
+    if (out_node != candidate) {
+      outputs.insert(out_node);
+    }
+  }
+  for (auto *in_node : candidate->inlinks) {
+    if (in_node != this) {
+      inputs.insert(in_node);
+    }
+  }
+
+// Update the dst and src node's inlinks and outlinks.
+#ifdef __clang__
+  inlinks = node_set_t(inputs.begin(), inputs.end());
+  outlinks = node_set_t(outputs.begin(), outputs.end());
+  candidate->inlinks.clear();
+  candidate->outlinks.clear();
+#else
+  inlinks = std::move(node_set_t(inputs.begin(), inputs.end()));
+  outlinks = std::move(node_set_t(outputs.begin(), outputs.end()));
+  candidate->inlinks.clear();
+  candidate->outlinks.clear();
+#endif
+
+  // Redirect the inlinks and outlinks that still reference the dst
+  // (candidate) node to the src node.
+  for (auto *in_node : inlinks) {
+    for (auto *&out_node : in_node->outlinks) {
+      if (out_node == candidate) {
+        out_node = this;
+      }
+    }
+  }
+  for (auto *out_node : outlinks) {
+    for (auto *&in_node : out_node->inlinks) {
+      if (in_node == candidate) {
+        in_node = this;
+      }
+    }
+  }
+}
+
+// FlexibleDFS
+// If reverse is true, do a reverse DFS.
+// If the enter func is not nullptr, calls enter(node) before visiting any
+// children of node.
+// If the leave func is not nullptr, calls leave(node) after visiting all
+// parents of node.
+void SubgraphDetector::FlexibleDFS(
+    const node_set_t &source,
+    bool reverse,
+    const std::function<bool(const node_dat_t *)> &enter,
+    const std::function<bool(const node_dat_t *)> &leave) {
+  std::vector<std::pair<const node_dat_t *, bool>> stack;  // node, leave
+  for (auto &node : source) {
+    stack.push_back(std::pair<const node_dat_t *, bool>(node, false));
+  }
+  std::unordered_set<const node_dat_t *> visited;
+  while (!stack.empty()) {
+    auto top = stack.back();
+    stack.pop_back();
+
+    if (top.second) {
+      if (leave && !leave(top.first)) return;
+    }
+    if (visited.count(top.first)) continue;
+    visited.insert(top.first);
+
+    if (enter && !enter(top.first)) return;
+
+    if (leave)
+      stack.push_back(std::pair<const node_dat_t *, bool>(top.first, true));
+    const node_set_t iter_nodes =
+        reverse ? top.first->inlinks : top.first->outlinks;
+    for (auto *node : iter_nodes) {
+      if (!visited.count(node)) {
+        stack.push_back(std::pair<const node_dat_t *, bool>(node, false));
+      }
+    }
+  }
+}
+
+void SubgraphDetector::InitNodes(node_map_t *nodes) {
+  // Initialize and mark the subgraph detector nodes based on teller.
+  for (auto &it : *nodes) {
+    for (auto &in_node : it.first->inlinks) {
+      it.second->inlinks.push_back((*nodes)[in_node]);
+    }
+    for (auto &out_node : it.first->outlinks) {
+      it.second->outlinks.push_back((*nodes)[out_node]);
+    }
+    if (teller_(it.first)) {
+      it.second->marked = true;
+      if (it.first->IsStmt()) {
+        // If a function is inside the subgraph, mark all of its output
+        // variables to be inside too, so that two marked functions will be
+        // inside the same subgraph. Let's take an example:
+        // A_function->var->B_function. If A_function is marked, var should
+        // also be marked, so that B_function will be in the same subgraph
+        // with A_function if B_function is marked.
+        for (auto &out_node : it.first->outlinks) {
+          (*nodes)[out_node]->marked = true;
+        }
+      }
+    }
+  }
+}
+
+std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubgraphs(
+    node_map_t *nodes) {
+  for (auto &it : *nodes) {
+    node_dat_t *node = it.second;
+    if (!node->marked) {
+      continue;
+    }
+    // Our algorithm must guarantee that:
+    // 1. The graph is always a directed acyclic graph (DAG).
+    // 2. If there is a path in the subgraph from X to Y (X and Y are both
+    //    nodes in the subgraph), then all paths from X to Y are in the
+    //    subgraph.
+    //
+    // In order to achieve the above guarantees, for adjacent nodes
+    // src -> dst:
+    // 1. Get all of dst's input nodes except src.
+    // 2. Reverse DFS from those input nodes.
+    // 3. If there is a path from the input nodes to src, then the src and
+    //    dst nodes can not be fused into one node, otherwise it can be done.
+    while (true) {
+      std::unordered_set<node_dat_t *> contract_nodes;
+      for (auto *out_node : node->outlinks) {
+        // must be a candidate
+        if (!out_node->marked) continue;
+        // get all dst input nodes except src node.
+        node_set_t source_nodes;
+        for (auto *in_node : out_node->inlinks) {
+          if (in_node != node) {
+            source_nodes.push_back(in_node);
+          }
+        }
+
+        // Reverse DFS from the source_nodes.
+        bool have_excess_path = false;
+        FlexibleDFS(source_nodes,
+                    true,
+                    nullptr,
+                    [&have_excess_path, node](const node_dat_t *n) {
+                      if (n == node) {
+                        have_excess_path = true;
+                        return false;
+                      }
+                      return true;
+                    });
+        if (have_excess_path) continue;
+        contract_nodes.insert(out_node);
+      }
+      if (contract_nodes.empty()) break;
+
+      for (auto &contract_node : contract_nodes) {
+        node->UnionFindCombine(contract_node);
+      }
+    }
+  }
+
+  std::unordered_map<node_dat_t *, std::vector<Node *>> clusters;
+  for (auto &node : graph_->StmtTopologicalOrder()) {
+    if (!node->IsStmt()) continue;
+    if ((*nodes)[node]->marked) {
+      clusters[(*nodes)[node]->UnionFindAncestor()].push_back(node);
+    }
+  }
+  std::vector<std::vector<Node *>> subgraphs;
+  std::for_each(clusters.begin(),
+                clusters.end(),
+                [&](const decltype(clusters)::value_type &it) {
+                  subgraphs.push_back(it.second);
+                });
+  return subgraphs;
+}
+
+std::vector<std::vector<Node *>> SubgraphDetector::operator()() {
+  node_map_t nodes;
+  for (auto &node : graph_->mutable_nodes()) {
+    nodes[&node] = new node_dat_t(&node);
+    CHECK(nodes[&node]);
+  }
+  // Initialize and mark the subgraph detector nodes based on teller.
+  InitNodes(&nodes);
+  // Run the Extract algorithm to find all subgraphs.
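+  // Note that each returned cluster lists its op nodes following the
+  // graph's StmtTopologicalOrder(), so the ops of a subgraph stay in a
+  // valid execution order when they are later copied into a sub block.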
+  std::vector<std::vector<Node *>> subgraphs = ExtractSubgraphs(&nodes);
+  for (auto &it : nodes) {
+    CHECK(it.second);
+    delete it.second;
+  }
+  return subgraphs;
+}
+
+void SubgraphFuser::InsertNewNode(SSAGraph *graph,
+                                  int subgraph_idx,
+                                  const std::vector<Node *> &subgraph_nodes) {
+  // Create and attach a new subgraph op
+  cpp::OpDesc subgraph_op_desc;
+  subgraph_op_desc.SetType("subgraph");
+
+  // Create a new sub block desc for storing all of the Ops and Vars of the
+  // target subgraph; sub_block_idx is set as an attribute of the subgraph
+  // op, and sub_block_idx < 0 means it's a new subgraph op
+  int sub_block_idx = -(subgraph_idx + 1);
+  auto sub_block_desc = new cpp::BlockDesc();
+  sub_block_desc->ClearOps();
+  sub_block_desc->ClearVars();
+  for (auto &op_node : subgraph_nodes) {
+    auto sub_block_op_desc = sub_block_desc->AddOp();
+    *sub_block_op_desc = *op_node->AsStmt().op_info();
+    sub_block_op_desc->SetAttr(
+        kKernelTypeAttr,
+        op_node->AsStmt().picked_kernel().SerializedKernelType());
+  }
+  subgraph_op_desc.SetAttr("sub_block", sub_block_idx);
+
+  // Extract input and output nodes from the target subgraph
+  std::unordered_set<Node *> input_var_nodes;
+  std::unordered_set<Node *> weight_var_nodes;
+  std::unordered_set<Node *> output_var_nodes;
+  std::unordered_set<Node *> local_var_nodes;
+  std::unordered_set<Node *> unused_var_nodes;
+  ExtractInputsOutputs(subgraph_nodes,
+                       &input_var_nodes,
+                       &weight_var_nodes,
+                       &output_var_nodes,
+                       &local_var_nodes,
+                       &unused_var_nodes);
+
+  // Set input and output name mapping which stores the real inputs and
+  // outputs
+  std::vector<std::string> input_var_names;
+  std::vector<std::string> output_var_names;
+  for (auto &var_node : input_var_nodes) {
+    input_var_names.push_back(var_node->AsArg().name);
+  }
+  for (auto &var_node : output_var_nodes) {
+    output_var_names.push_back(var_node->AsArg().name);
+  }
+  subgraph_op_desc.SetAttr<std::vector<std::string>>("input_data_names",
+                                                     input_var_names);
+  subgraph_op_desc.SetAttr<std::vector<std::string>>("output_data_names",
+                                                     output_var_names);
+
+  // Set all of the inputs and outputs to the target subgraph op, to prevent
+  // vars from being removed in RuntimeProgram::UpdateVarsOfProgram()
+  for (auto &var_node : weight_var_nodes) {
+    input_var_names.push_back(var_node->AsArg().name);
+  }
+  for (auto &var_node : local_var_nodes) {
+    output_var_names.push_back(var_node->AsArg().name);
+  }
+  for (auto &var_node : unused_var_nodes) {
+    output_var_names.push_back(var_node->AsArg().name);
+  }
+  subgraph_op_desc.SetInput("Inputs", input_var_names);
+  subgraph_op_desc.SetOutput("Outputs", output_var_names);
+  auto subgraph_op = LiteOpRegistry::Global().Create("subgraph");
+  static_cast<operators::SubgraphOp *>(subgraph_op.get())
+      ->SetSubBlock(sub_block_desc);
+  auto any_op = (*subgraph_nodes.begin())->AsStmt().op();
+  subgraph_op->Attach(subgraph_op_desc, any_op->scope());
+
+  // Create and add a new subgraph node into the graph
+  auto subgraph_op_node =
+      graph->GraphCreateInstructNode(subgraph_op, any_op->valid_places());
+  for (auto &var_node : input_var_nodes) {
+    IR_NODE_LINK_TO(var_node, subgraph_op_node);
+  }
+  for (auto &var_node : weight_var_nodes) {
+    IR_NODE_LINK_TO(var_node, subgraph_op_node);
+  }
+  for (auto &var_node : output_var_nodes) {
+    IR_OP_VAR_LINK(subgraph_op_node, var_node);
+  }
+  for (auto &var_node : local_var_nodes) {
+    IR_OP_VAR_LINK(subgraph_op_node, var_node);
+  }
+  for (auto &var_node : unused_var_nodes) {
+    IR_OP_VAR_LINK(subgraph_op_node, var_node);
+  }
+
+  // Create and assign the context to the picked kernel of the new subgraph
+  // node
+  auto &inst = subgraph_op_node->AsStmt();
+  inst.picked_kernel().SetContext(
ContextScheduler::Global().NewContext(inst.picked_kernel().target())); + + // Remove subgraph nodes and unused var nodes + auto nodes2rm = GetNodes2RM(subgraph_nodes, + {input_var_nodes, + weight_var_nodes, + output_var_nodes, + local_var_nodes, + unused_var_nodes}); + GraphSafeRemoveNodes(graph, nodes2rm); +} + +void SubgraphFuser::ReplaceNodesWithSubgraphs(SSAGraph *graph, + const SubgraphTeller &teller, + int min_subgraph_size) { + std::vector> subgraphs = + SubgraphDetector(graph, teller)(); + SubgraphVisualizer(graph, subgraphs)(); + for (int subgraph_idx = 0; subgraph_idx < subgraphs.size(); subgraph_idx++) { + if (subgraphs[subgraph_idx].size() >= min_subgraph_size) { + InsertNewNode(graph, subgraph_idx, subgraphs[subgraph_idx]); + } + } +} + +void SubgraphFuser::operator()() { + ReplaceNodesWithSubgraphs(graph_, teller_, min_subgraph_size_); +} + +void ExtractInputsOutputs(const std::vector &op_nodes, + std::unordered_set *input_var_nodes, + std::unordered_set *weight_var_nodes, + std::unordered_set *output_var_nodes, + std::unordered_set *local_var_nodes, + std::unordered_set *unused_var_nodes) { + for (auto &op_node : op_nodes) { + for (auto &var_node : op_node->inlinks) { + if (var_node->AsArg().is_weight) { + weight_var_nodes->insert(var_node); + continue; + } + if (!var_node->inlinks.empty()) { + // Var can only come from one op node, so use front + auto *prev_op_node = var_node->inlinks.front(); + if (std::find(op_nodes.begin(), op_nodes.end(), prev_op_node) != + op_nodes.end()) { + continue; + } + } + input_var_nodes->insert(var_node); + } + for (auto &var_node : op_node->outlinks) { + if (var_node->outlinks.empty()) { + // The next op is empty so this var is actually unused + unused_var_nodes->insert(var_node); + continue; + } + // Var can have more than one next op node, So, if any one in the + // op_nodes then continue + bool next_op_in_nodes = false; + for (auto &next_op_node : var_node->outlinks) { + if (std::find(op_nodes.begin(), op_nodes.end(), next_op_node) != + op_nodes.end()) { + next_op_in_nodes = true; + } + } + if (next_op_in_nodes) { + local_var_nodes->insert(var_node); + continue; + } + output_var_nodes->insert(var_node); + } + } +} + +std::unordered_set GetNodes2RM( + const std::vector &op_nodes, + const std::vector> &excluded_var_nodes) { + std::unordered_set nodes2rm(op_nodes.begin(), op_nodes.end()); + for (auto &op_node : op_nodes) { + for (auto &var_node : op_node->inlinks) { + if (!nodes2rm.count(var_node)) { + nodes2rm.insert(var_node); + } + } + for (auto &var_node : op_node->outlinks) { + if (!nodes2rm.count(var_node)) { + nodes2rm.insert(var_node); + } + } + } + // Excluded nodes should not be removed + for (auto &excluded_var_node : excluded_var_nodes) { + for (auto &var_node : excluded_var_node) { + if (nodes2rm.count(var_node)) { + nodes2rm.erase(var_node); + } + } + } + return nodes2rm; +} + +static void SortHelper(Node *node, + const std::unordered_set &unordered_nodes, + std::unordered_set *visited_nodes, + std::vector *ordered_nodes) { + for (auto &var_node : node->inlinks) { + if (var_node->inlinks.empty()) continue; + auto *op_node = var_node->inlinks.front(); + if (unordered_nodes.count(op_node) && !visited_nodes->count(op_node)) { + SortHelper(op_node, unordered_nodes, visited_nodes, ordered_nodes); + } + } + ordered_nodes->push_back(node); + visited_nodes->insert(node); +} + +std::vector GetTopologicalOrder( + const std::unordered_set &unordered_nodes) { + std::unordered_set visited_nodes; + std::vector ordered_nodes; + for (auto &node 
diff --git a/lite/core/mir/subgraph/subgraph_detector.h b/lite/core/mir/subgraph/subgraph_detector.h
new file mode 100644
index 0000000000000000000000000000000000000000..b6873655e976a785383269972221f001196431f8
--- /dev/null
+++ b/lite/core/mir/subgraph/subgraph_detector.h
@@ -0,0 +1,127 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include "lite/core/mir/pass.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+using SubgraphTeller = std::function<bool(Node*)>;
+
+class SubgraphVisualizer {
+ public:
+  SubgraphVisualizer(SSAGraph* graph,
+                     const std::vector<std::vector<Node*>>& subgraphs)
+      : graph_(graph), subgraphs_(subgraphs) {}
+  std::string operator()();
+
+ protected:
+  SSAGraph* graph_{nullptr};
+  std::vector<std::vector<Node*>> subgraphs_;
+};
+
+/*
+ * Divide the graph into subgraphs according to the specified conditions.
+ * Return the divided clusters; a cluster consists of the op nodes of one
+ * subgraph.
+ */
+class SubgraphDetector {
+ public:
+  // This is a simple representation of the graph: each node_dat_t holds a
+  // pointer to the underlying Node, so that the graph analysis does not
+  // change the original graph.
+  struct node_dat_t;
+  using node_map_t = std::unordered_map<Node*, node_dat_t*>;
+  using node_set_t = std::vector<node_dat_t*>;
+  struct node_dat_t {
+    explicit node_dat_t(Node* _node) : node(_node) {}
+    Node* node;
+    bool marked{false};
+    node_dat_t* union_find_parent{this};
+    node_set_t inlinks{};
+    node_set_t outlinks{};
+    node_dat_t* UnionFindAncestor();
+    void UnionFindCombine(node_dat_t* candidate);
+  };
+  SubgraphDetector(SSAGraph* graph, const SubgraphTeller& teller)
+      : graph_(graph), teller_(teller) {}
+  std::vector<std::vector<Node*>> operator()();
+
+  void FlexibleDFS(const node_set_t& source,
+                   bool reverse,
+                   const std::function<bool(const node_dat_t*)>& enter,
+                   const std::function<bool(const node_dat_t*)>& leave);
+  void InitNodes(node_map_t* nodes);
+  std::vector<std::vector<Node*>> ExtractSubgraphs(node_map_t* nodes);
+
+ protected:
+  SSAGraph* graph_{nullptr};
+  SubgraphTeller teller_;
+};
+
+/*
+ * Replace each detected subgraph with a subgraph op: a block desc is added
+ * into the subgraph op to wrap the original op nodes, and the var nodes of
+ * the original op nodes are kept as the inputs and outputs of the subgraph
+ * op.
+ */
+class SubgraphFuser {
+ public:
+  SubgraphFuser(SSAGraph* graph,
+                const SubgraphTeller& teller,
+                int min_subgraph_size)
+      : graph_(graph), teller_(teller), min_subgraph_size_{min_subgraph_size} {}
+  void operator()();
+
+  // Remove the op nodes of the subgraphs and replace them with subgraph ops
+  void ReplaceNodesWithSubgraphs(SSAGraph* graph,
+                                 const SubgraphTeller& teller,
+                                 int min_subgraph_size);
+  // Create a subgraph node with a block desc to wrap the original op nodes of
+  // the subgraph
+  void InsertNewNode(SSAGraph* graph,
+                     int subgraph_idx,
+                     const std::vector<Node*>& subgraph_nodes);
+
+ protected:
+  SSAGraph* graph_{nullptr};
+  SubgraphTeller teller_;
+  int min_subgraph_size_;
+};
+
+void ExtractInputsOutputs(const std::vector<Node*>& op_nodes,
+                          std::unordered_set<Node*>* input_var_nodes,
+                          std::unordered_set<Node*>* weight_var_nodes,
+                          std::unordered_set<Node*>* output_var_nodes,
+                          std::unordered_set<Node*>* local_var_nodes,
+                          std::unordered_set<Node*>* unused_var_nodes);
+
+std::unordered_set<Node*> GetNodes2RM(
+    const std::vector<Node*>& op_nodes,
+    const std::vector<std::unordered_set<Node*>>& excluded_var_nodes);
+
+std::vector<Node*> GetTopologicalOrder(
+    const std::unordered_set<Node*>& unordered_nodes);
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
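The `union_find_parent` / `UnionFindAncestor` / `UnionFindCombine` members above implement a union-find over the detector's shadow nodes: supported ops that may be fused end up in one cluster keyed by a common ancestor. A minimal sketch of that idiom follows; it is deliberately simplified (the real detector also has to keep merged clusters acyclic), and all names here are illustrative only.

```cpp
// Minimal union-find with path compression, mirroring the idiom of
// node_dat_t::UnionFindAncestor / UnionFindCombine.
#include <cassert>

struct UfNode {
  UfNode* parent{this};  // each node starts as its own cluster root

  UfNode* Ancestor() {
    UfNode* root = this;
    while (root->parent != root) root = root->parent;
    // Path compression: re-point the whole chain directly at the root.
    for (UfNode* p = this; p != root;) {
      UfNode* next = p->parent;
      p->parent = root;
      p = next;
    }
    return root;
  }

  void Combine(UfNode* candidate) {
    // Merge the candidate's cluster into this node's cluster.
    candidate->Ancestor()->parent = Ancestor();
  }
};

int main() {
  UfNode a, b, c;
  a.Combine(&b);  // clusters: {a, b} {c}
  b.Combine(&c);  // clusters: {a, b, c}
  assert(a.Ancestor() == b.Ancestor());
  assert(b.Ancestor() == c.Ancestor());
}
```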
diff --git a/lite/core/mir/subgraph/subgraph_program_pass_test.cc b/lite/core/mir/subgraph/subgraph_detector_test.cc
similarity index 65%
rename from lite/core/mir/subgraph/subgraph_program_pass_test.cc
rename to lite/core/mir/subgraph/subgraph_detector_test.cc
index 22e20b81d831ff25df090a7565e671b9139122f7..3b0d7c5cd5c8a0d0901750148359f430b6d49894 100644
--- a/lite/core/mir/subgraph/subgraph_program_pass_test.cc
+++ b/lite/core/mir/subgraph/subgraph_detector_test.cc
@@ -12,68 +12,25 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "lite/core/mir/subgraph/subgraph_program_pass.h"
+#include "lite/core/mir/subgraph/subgraph_detector.h"
 #include <gtest/gtest.h>
 #include <memory>
 #include <vector>
 #include "lite/api/paddle_use_ops.h"
 #include "lite/api/paddle_use_passes.h"
-#include "lite/core/mir/graph_visualize_pass.h"
 #include "lite/core/mir/ssa_graph.h"
 #include "lite/core/program.h"
 #include "lite/model_parser/cpp/program_desc.h"
 #include "lite/model_parser/model_parser.h"
 
 DEFINE_string(model_dir, "", "model_dir");
+DEFINE_string(model_file, "", "model file path of combined protobuf model");
+DEFINE_string(params_file, "", "params file path of combined protobuf model");
 
 namespace paddle {
 namespace lite {
 
-TEST(SubgraphTest, models) {
-  cpp::ProgramDesc program_desc;
-  auto scope = std::make_shared<Scope>();
-  // LoadModelPb(FLAGS_model_dir,
-  //             FLAGS_model_dir + "/model",
-  //             FLAGS_model_dir + "/params",
-  //             scope.get(),
-  //             &program_desc,
-  //             true);
-  LoadModelPb(FLAGS_model_dir, "", "", scope.get(), &program_desc);
-  std::vector<Place> valid_places({
-      Place{TARGET(kHost), PRECISION(kFloat)},
-#ifdef LITE_WITH_ARM
-      Place{TARGET(kARM), PRECISION(kFloat)},
-#endif
-#ifdef LITE_WITH_NPU
-      Place{TARGET(kNPU), PRECISION(kFloat)},
-#endif
-#ifdef LITE_WITH_XPU
-      Place{TARGET(kXPU), PRECISION(kFloat)},
-#endif
-  });
-  lite::Program program(program_desc, scope, valid_places);
-  auto graph = std::unique_ptr<mir::SSAGraph>(new mir::SSAGraph());
-  graph->Build(program, valid_places);
-
-  std::vector<std::string> supported_op_types{"concat",
-                                              "conv2d",
-                                              "depthwise_conv2d",
-                                              "batch_norm",
-                                              "scale",
-                                              "pool2d",
-                                              "mul",
-                                              "elementwise_add",
-                                              "softmax",
-                                              "split",
-                                              "relu",
-                                              "reshape2",
-                                              "transpose2"};
-  auto* pass = new mir::subgraph::SubgraphProgramPass;
-  ASSERT_EQ(pass->FuseSubgraph(graph, supported_op_types), 1);
-  LOG(INFO) << "After NPU Pass \n" << Visualize(graph.get());
-}
-
-// return output_var_names
+// Helper functions for building a model manually
 std::vector<std::string> AddFCDesc(
     cpp::BlockDesc* block_desc,
     const std::shared_ptr<Scope>& scope,
@@ -84,24 +41,23 @@ std::vector<std::string> AddFCDesc(
   static int
id = 0; std::string prefix = "fc_" + std::to_string(id); auto* op_desc = block_desc->AddOp(); - auto* wgt = block_desc->AddVar(); - auto* bias = block_desc->AddVar(); - auto* out = block_desc->AddVar(); + auto* wgt = block_desc->AddVar(); wgt->SetName(prefix + "_W"); - bias->SetName(prefix + "_Bias"); - out->SetName(prefix + "_Out"); - std::vector out_var_names{prefix + "_Out"}; - - auto* wtensor = scope->Var(prefix + "_W")->GetMutable(); + auto* wtensor = scope->Var(prefix + "_W")->GetMutable(); wtensor->Resize(wshape); wtensor->mutable_data(); - auto* btensor = scope->Var(prefix + "_Bias")->GetMutable(); + auto* bias = block_desc->AddVar(); + bias->SetName(prefix + "_Bias"); + auto* btensor = scope->Var(prefix + "_Bias")->GetMutable(); btensor->Resize({wshape[1]}); btensor->mutable_data(); - scope->Var(prefix + "_Out")->GetMutable(); + auto* out = block_desc->AddVar(); + out->SetName(prefix + "_Out"); + std::vector out_var_names{prefix + "_Out"}; + scope->Var(prefix + "_Out")->GetMutable(); op_desc->SetType("fc"); op_desc->SetInput("Input", input_var_names); @@ -127,7 +83,7 @@ std::vector AddElementwiseAddDesc( out->SetName(prefix + "_Out"); std::vector out_var_names{prefix + "_Out"}; - scope->Var(prefix + "_Out")->GetMutable(); + scope->Var(prefix + "_Out")->GetMutable(); op_desc->SetType("elementwise_add"); op_desc->SetInput("X", input_X_names); @@ -151,7 +107,7 @@ std::vector AddFeedDesc( out->SetName(prefix + "_Out"); std::vector out_var_names{prefix + "_Out"}; - scope->Var(prefix + "_Out")->GetMutable(); + scope->Var(prefix + "_Out")->GetMutable(); op_desc->SetType("feed"); op_desc->SetInput("X", input_X_names); @@ -174,7 +130,7 @@ std::vector AddFetchDesc( out->SetName(prefix + "_Out"); std::vector out_var_names{prefix + "_Out"}; - scope->Var(prefix + "_Out")->GetMutable(); + scope->Var(prefix + "_Out")->GetMutable(); op_desc->SetType("fetch"); op_desc->SetInput("X", input_X_names); @@ -184,41 +140,88 @@ std::vector AddFetchDesc( return out_var_names; } -std::unique_ptr BuildSimpleNet( - cpp::ProgramDesc* program_desc, - const std::shared_ptr& scope, - const std::vector& valid_places) { - program_desc->ClearBlocks(); - auto* block_desc = program_desc->AddBlock(); +TEST(Subgraph, detect_simple_model) { + cpp::ProgramDesc program_desc; + std::vector valid_places{{TARGET(kHost), PRECISION(kFloat)}}; + auto scope = std::make_shared(); + // Build a simple network + program_desc.ClearBlocks(); + auto* block_desc = program_desc.AddBlock(); block_desc->ClearOps(); block_desc->ClearVars(); - auto* var_desc = block_desc->AddVar(); var_desc->SetName("feed_var"); - auto* feed_var = scope->Var("feed_var")->GetMutable(); + auto* feed_var = scope->Var("feed_var")->GetMutable(); feed_var->Resize({1, 4}); auto fc1_out = AddFCDesc(block_desc, scope, {"feed_var"}, {4, 5}); auto fc2_out = AddFCDesc(block_desc, scope, fc1_out, {5, 2}); - - lite::Program program(*program_desc, scope, valid_places); + Program program(program_desc, scope, valid_places); auto graph = std::unique_ptr(new mir::SSAGraph()); graph->Build(program, valid_places); - - return graph; + // Apply subgraph detector and check results + auto teller = [](mir::Node* node) { + if (!node->IsStmt()) return false; + auto& stmt = node->AsStmt(); + auto op_type = stmt.op_type(); + const std::vector supported_types = {"fc"}; + return std::find(supported_types.begin(), supported_types.end(), op_type) != + supported_types.end(); + }; + std::vector> subgraphs = + mir::SubgraphDetector(graph.get(), teller)(); + ASSERT_EQ(subgraphs.size(), 1); + 
ASSERT_EQ(graph->nodes().size(), 9);
+  mir::SubgraphVisualizer(graph.get(), subgraphs)();
 }
 
-TEST(SubGraphTest, SimpleNet) {
+TEST(Subgraph, detect_custom_model) {
+  if (FLAGS_model_dir.empty() && FLAGS_model_file.empty() &&
+      FLAGS_params_file.empty()) {
+    LOG(INFO) << "Use --model_dir, or --model_file and --params_file, to set "
+                 "the paths of the model files.";
+    return;
+  }
   cpp::ProgramDesc program_desc;
-  std::vector<Place> places{{TARGET(kHost), PRECISION(kFloat)}};
   auto scope = std::make_shared<Scope>();
-  auto graph = BuildSimpleNet(&program_desc, scope, places);
-
-  std::vector<std::string> supported_op_types{"fc"};
-  auto* pass = new mir::subgraph::SubgraphProgramPass;
-  ASSERT_EQ(pass->FuseSubgraph(graph, supported_op_types), 1);
-
-  ASSERT_EQ(graph->nodes().size(), 9);
-  // LOG(INFO) << "After NPU Pass \n" << Visualize(graph.get());
+  LoadModelPb(FLAGS_model_dir,
+              FLAGS_model_file,
+              FLAGS_params_file,
+              scope.get(),
+              &program_desc,
+              !FLAGS_model_file.empty() && !FLAGS_params_file.empty(),
+              false);
+  std::vector<Place> valid_places({
+#ifdef LITE_WITH_ARM
+      Place{TARGET(kARM), PRECISION(kFloat)},
+#endif
+#ifdef LITE_WITH_X86
+      Place{TARGET(kX86), PRECISION(kFloat)},
+#endif
+#ifdef LITE_WITH_NPU
+      Place{TARGET(kNPU), PRECISION(kFloat)},
+#endif
+#ifdef LITE_WITH_XPU
+      Place{TARGET(kXPU), PRECISION(kFloat)},
+#endif
+  });
+  Program program(program_desc, scope, valid_places);
+  auto graph = std::unique_ptr<mir::SSAGraph>(new mir::SSAGraph());
+  graph->Build(program, valid_places);
+  // Apply the subgraph detector and check the results
+  auto teller = [](mir::Node* node) {
+    if (!node->IsStmt()) return false;
+    auto& stmt = node->AsStmt();
+    auto op_type = stmt.op_type();
+    const std::vector<std::string> unsupported_types = {
+        "feed", "fetch", "subgraph"};
+    return std::find(unsupported_types.begin(),
+                     unsupported_types.end(),
+                     op_type) == unsupported_types.end();
+  };
+  std::vector<std::vector<mir::Node*>> subgraphs =
+      mir::SubgraphDetector(graph.get(), teller)();
+  ASSERT_EQ(subgraphs.size(), 1);
+  mir::SubgraphVisualizer(graph.get(), subgraphs)();
 }
 
 }  // namespace lite
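The NPU/XPU passes in the new subgraph_pass.cc below build their supported-op set with an X-macro: paddle_use_bridges.h is a list of `USE_SUBGRAPH_BRIDGE(...)` entries, and the includer decides what each entry expands to. A standalone sketch of the pattern, with a hypothetical two-op list standing in for the real `#include "lite/kernels/npu/bridges/paddle_use_bridges.h"`:

```cpp
// X-macro sketch: OPS_LIST stands in for the included bridge list; the real
// code achieves the same effect by #include-ing paddle_use_bridges.h.
#include <iostream>
#include <string>
#include <unordered_set>

#define OPS_LIST                          \
  USE_SUBGRAPH_BRIDGE(kNPU, conv2d)       \
  USE_SUBGRAPH_BRIDGE(kNPU, softmax)

int main() {
  std::unordered_set<std::string> supported;
// Expand each list entry into an insert() call, exactly as the pass does.
#define USE_SUBGRAPH_BRIDGE(dev_type, op_type) supported.insert(#op_type);
  OPS_LIST
#undef USE_SUBGRAPH_BRIDGE
  std::cout << supported.count("conv2d") << supported.count("relu") << "\n";  // 10
}
```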
diff --git a/lite/core/mir/subgraph/subgraph_pass.cc b/lite/core/mir/subgraph/subgraph_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b974ac7043e2fc1c656c4bad69e7ca50fffaff8c
--- /dev/null
+++ b/lite/core/mir/subgraph/subgraph_pass.cc
@@ -0,0 +1,63 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/mir/subgraph/subgraph_pass.h"
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+#include "lite/core/mir/pass_registry.h"
+#include "lite/core/mir/subgraph/subgraph_detector.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  std::unordered_set<std::string> supported_lists;
+#define USE_SUBGRAPH_BRIDGE(dev_type, op_type) supported_lists.insert(#op_type);
+#include "lite/kernels/npu/bridges/paddle_use_bridges.h"
+#undef USE_SUBGRAPH_BRIDGE
+  auto teller = [&](Node* node) {
+    if (!node->IsStmt()) return false;
+    auto& stmt = node->AsStmt();
+    return supported_lists.count(stmt.op_type()) != 0;
+  };
+  SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
+  fuser();
+}
+
+void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
+  std::unordered_set<std::string> supported_lists;
+#define USE_SUBGRAPH_BRIDGE(dev_type, op_type) supported_lists.insert(#op_type);
+#include "lite/kernels/xpu/bridges/paddle_use_bridges.h"
+#undef USE_SUBGRAPH_BRIDGE
+  auto teller = [&](Node* node) {
+    if (!node->IsStmt()) return false;
+    auto& stmt = node->AsStmt();
+    return supported_lists.count(stmt.op_type()) != 0;
+  };
+  SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
+  fuser();
+}
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass)
+    .BindTargets({TARGET(kNPU)});
+REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
+    .BindTargets({TARGET(kXPU)});
diff --git a/lite/core/mir/subgraph/subgraph_pass.h b/lite/core/mir/subgraph/subgraph_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..554f54304afcd2eac3069c101f2e19ff9391fa66
--- /dev/null
+++ b/lite/core/mir/subgraph/subgraph_pass.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include "lite/core/mir/pass.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+class NPUSubgraphPass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+};
+
+class XPUSubgraphPass : public ProgramPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc
similarity index 68%
rename from lite/core/mir/subgraph/generate_npu_program_pass_test.cc
rename to lite/core/mir/subgraph/subgraph_pass_test.cc
index 1afb54c692592ca42d8b120dcf1a91922e19149c..45c82a4262f16ab180596375cc037cc0e9febec2 100644
--- a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc
+++ b/lite/core/mir/subgraph/subgraph_pass_test.cc
@@ -30,7 +30,9 @@ DEFINE_int32(output_tensor_num, 1, "number of output tensors");
 namespace paddle {
 namespace lite {
 
-std::vector<std::vector<int64_t>> ParseShape(std::string txt) {
+// Helper functions for loading and running a model from the command line and
+// verifying the output data
+std::vector<std::vector<int64_t>> ShapeParsing(std::string txt) {
   std::vector<std::vector<int64_t>> shape;
   while (!txt.empty()) {
     size_t idx = txt.find_first_of(":");
@@ -65,7 +67,7 @@ int64_t ShapeProduction(std::vector<int64_t> shape) {
   return s;
 }
 
-void FillInputTensor(
+void FillInputTensors(
     const std::shared_ptr<lite_api::PaddlePredictor>& predictor,
     const std::vector<std::vector<int64_t>>& input_tensor_shape,
     const float value) {
@@ -80,7 +82,7 @@
   }
 }
 
-void CompareOutputTensor(
+void CheckOutputTensors(
    const std::shared_ptr<lite_api::PaddlePredictor>& tar_predictor,
    const std::shared_ptr<lite_api::PaddlePredictor>& ref_predictor,
    const int output_tensor_num) {
@@ -96,7 +98,7 @@
       auto abs_diff =
           std::fabs(tar_output_tensor_data[j] - ref_output_tensor_data[j]);
       auto rel_diff = abs_diff / (std::fabs(ref_output_tensor_data[j]) + 1e-6);
-      VLOG(3) << "val: " << tar_output_tensor_data[j]
+      VLOG(5) << "val: " << tar_output_tensor_data[j]
               << " ref: " << ref_output_tensor_data[j]
               << " abs_diff: " << abs_diff << " rel_diff: " << rel_diff;
       EXPECT_LT(rel_diff, 0.1);
@@ -111,24 +113,23 @@ std::shared_ptr<lite_api::PaddlePredictor> TestModel(
     const std::vector<lite_api::Place>& valid_places,
     const std::vector<std::vector<int64_t>>& input_tensor_shape,
     const std::string& optimized_model_dir) {
-  // generate optimized model
+  // Generate optimized model
   lite_api::CxxConfig cxx_config;
   cxx_config.set_model_dir(model_dir);
   cxx_config.set_model_file(model_file);
   cxx_config.set_param_file(params_file);
   cxx_config.set_valid_places(valid_places);
   auto predictor = lite_api::CreatePaddlePredictor(cxx_config);
-  FillInputTensor(predictor, input_tensor_shape, 1);
   predictor->SaveOptimizedModel(optimized_model_dir,
                                 lite_api::LiteModelType::kNaiveBuffer);
-  // load optimized model
+  // Load optimized model
   lite_api::MobileConfig mobile_config;
   mobile_config.set_model_dir(optimized_model_dir);
   mobile_config.set_power_mode(lite_api::PowerMode::LITE_POWER_HIGH);
   mobile_config.set_threads(1);
   predictor = lite_api::CreatePaddlePredictor(mobile_config);
-  FillInputTensor(predictor, input_tensor_shape, 1);
-  // run optimized model
+  FillInputTensors(predictor, input_tensor_shape, 1);
+  // Run optimized model
   for (int i = 0; i < FLAGS_warmup; i++) {
     predictor->Run();
   }
@@ -140,32 +141,48 @@
   return predictor;
 }
 
-TEST(NPUSubgraph, compare) {
-  // parsing input tensor shape, supported formats: "1,3,224,224"
-  // "1,3,224,224:1,80"
+TEST(Subgraph, generate_model_and_check_precision) {
+  if (FLAGS_model_dir.empty() && 
FLAGS_model_file.empty() && + FLAGS_params_file.empty()) { + LOG(INFO) << "Using --model_dir, or --model_file and --params_file to set " + "the path of model files."; + return; + } + // Parsing the shapes of input tensors from strings, supported formats: + // "1,3,224,224" and "1,3,224,224:1,80" std::vector> input_tensor_shape = - ParseShape(FLAGS_input_tensor_shape); - // generate and run optimized CPU model - LOG(INFO) << " ================ CPU ================== "; - auto cpu_predictor = - TestModel(FLAGS_model_dir, - FLAGS_model_file, - FLAGS_params_file, - {lite_api::Place{TARGET(kARM), PRECISION(kFloat)}}, - input_tensor_shape, - FLAGS_optimized_model_dir + "/CPU"); - // generate and run optimized NPU model - LOG(INFO) << " ================ NPU ================== "; - auto npu_predictor = - TestModel(FLAGS_model_dir, - FLAGS_model_file, - FLAGS_params_file, - {lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}, - lite_api::Place{TARGET(kARM), PRECISION(kFloat)}}, - input_tensor_shape, - FLAGS_optimized_model_dir + "/NPU"); - // verify results - CompareOutputTensor(npu_predictor, cpu_predictor, FLAGS_output_tensor_num); + ShapeParsing(FLAGS_input_tensor_shape); + std::vector valid_places({ +#ifdef LITE_WITH_ARM + lite_api::Place{TARGET(kARM), PRECISION(kFloat)}, +#endif +#ifdef LITE_WITH_X86 + lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, +#endif + }); + // Generate and run optimized model on CPU as the reference predictor + auto ref_predictor = TestModel(FLAGS_model_dir, + FLAGS_model_file, + FLAGS_params_file, + valid_places, + input_tensor_shape, + FLAGS_optimized_model_dir + "/ref_opt_model"); +// Generate and run optimized model on NPU/XPU as the target predictor +#ifdef LITE_WITH_NPU + valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}); +#endif +#ifdef LITE_WITH_XPU + valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}); +#endif + auto tar_predictor = TestModel(FLAGS_model_dir, + FLAGS_model_file, + FLAGS_params_file, + valid_places, + input_tensor_shape, + FLAGS_optimized_model_dir + "/tar_opt_model"); + // Check the difference of the output tensors between reference predictor and + // target predictor + CheckOutputTensors(tar_predictor, ref_predictor, FLAGS_output_tensor_num); } } // namespace lite diff --git a/lite/core/mir/subgraph/subgraph_program_pass.cc b/lite/core/mir/subgraph/subgraph_program_pass.cc deleted file mode 100644 index 719a01dfd892f83da5e1d9b1efa6df758612acc7..0000000000000000000000000000000000000000 --- a/lite/core/mir/subgraph/subgraph_program_pass.cc +++ /dev/null @@ -1,345 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/core/mir/subgraph/subgraph_program_pass.h" -#include -#include -#include -#include -#include "lite/core/mir/graph_visualize_pass.h" -#include "lite/core/mir/pass_registry.h" -#include "lite/core/mir/pattern_matcher.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace subgraph { - -std::unordered_map> -SubgraphProgramPass::ClassifySubgraph(const std::unique_ptr& graph) { - std::unordered_map> op_nodes; - for (auto& item : graph->StmtTopologicalOrder()) { - if (!item->IsStmt()) continue; - auto& stmt = item->AsStmt(); - int sub_id = stmt.subgraph_id(); - if (sub_id < 1) continue; - if (!op_nodes.count(sub_id)) { - op_nodes[sub_id] = std::unordered_set(); - } - op_nodes.at(sub_id).insert(item); - } - return op_nodes; -} - -cpp::OpDesc SubgraphProgramPass::GenGraphOpDesc( - const std::string& weight_var_name, - const std::vector& in_var_names, - const std::vector& out_var_names) { - cpp::OpDesc op_desc; - op_desc.SetType("graph_op"); - op_desc.SetInput("Inputs", in_var_names); - op_desc.SetInput("Weight", {weight_var_name}); - op_desc.SetOutput("Outputs", out_var_names); - return op_desc; -} - -void SubgraphProgramPass::InsertNewNode( - const std::unique_ptr& graph, - const std::string& weight_var_name, - Scope* scope, - const std::vector& valid_places, - std::unordered_set in_data_vars, - std::unordered_set in_wgt_vars, - std::unordered_set out_data_vars, - std::unordered_set out_unused_vars) { - std::vector in_var_names; - std::vector out_var_names; - for (auto i : in_data_vars) { - in_var_names.push_back(i->AsArg().name); - } - for (auto i : out_data_vars) { - out_var_names.push_back(i->AsArg().name); - } - - auto op_desc = GenGraphOpDesc(weight_var_name, in_var_names, out_var_names); - - auto graph_op = LiteOpRegistry::Global().Create("graph_op"); - graph_op->Attach(op_desc, scope); - auto* new_op_node = graph->GraphCreateInstructNode(graph_op, valid_places); - - for (auto& in_var : in_data_vars) { - IR_NODE_LINK_TO(in_var, new_op_node); - } - for (auto& in_var : in_wgt_vars) { - IR_NODE_LINK_TO(in_var, new_op_node); - } - for (auto& out_var : out_data_vars) { - IR_OP_VAR_LINK(new_op_node, out_var); - } - for (auto& out_var : out_unused_vars) { - IR_OP_VAR_LINK(new_op_node, out_var); - } - - // add weight node to store pre-compilied NPU model - auto new_weight_node = graph->NewArgumentNode(weight_var_name); - new_weight_node->AsArg().is_weight = true; - new_weight_node->AsArg().is_persist = true; - DirectedLink(new_weight_node, new_op_node); - - // assign context - auto& inst = new_op_node->AsStmt(); - inst.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(inst.picked_kernel().target())); -} - -void SubgraphProgramPass::SortHelper( - Node* node, - const std::unordered_set& nodes_all, - std::unordered_set* visited_nodes, - std::vector* ret) { - for (auto& var_node : node->inlinks) { - if (var_node->inlinks.empty()) continue; - auto* op_node = var_node->inlinks.front(); - if (nodes_all.count(op_node) && !visited_nodes->count(op_node)) { - SortHelper(op_node, nodes_all, visited_nodes, ret); - } - } - ret->push_back(node); - visited_nodes->insert(node); -} - -std::vector SubgraphProgramPass::GetTopologicalOrder( - const std::unordered_set& nodes) { - std::unordered_set visited; - std::vector ret; - for (auto& node : nodes) { - if (!node->IsStmt()) continue; - if (visited.count(node)) continue; - SortHelper(node, nodes, &visited, &ret); - } - return ret; -} - -void SubgraphProgramPass::FindInputOutputVars( - const std::unordered_set& 
op_nodes, - std::unordered_set* in_data_vars, - std::unordered_set* in_wgt_vars, - std::unordered_set* out_data_vars, - std::unordered_set* out_unused_vars) { - for (auto& op_node : op_nodes) { - for (auto& in_var : op_node->inlinks) { - if (in_var->AsArg().is_weight) { - in_wgt_vars->insert(in_var); - continue; - } - if (!in_var->inlinks.empty()) { - // var can only come from one op node, so use front - auto* pre_op_node = in_var->inlinks.front(); - if (op_nodes.count(pre_op_node)) { - continue; - } - } - in_data_vars->insert(in_var); - } - for (auto& out_var : op_node->outlinks) { - if (out_var->outlinks.empty()) { - // the next op is empty so this var is actually unused - out_unused_vars->insert(out_var); - continue; - } - // var can have more than one next op node - // so, if any one in the op_nodes then continue - bool next_op_in_nodes = false; - for (auto& next_op_node : out_var->outlinks) { - if (op_nodes.count(next_op_node)) { - next_op_in_nodes = true; - } - } - if (next_op_in_nodes) { - continue; - } - - out_data_vars->insert(out_var); - } - } -} - -std::unordered_set SubgraphProgramPass::GetNode2rm( - const std::unordered_set& op_nodes, - const std::vector>& excluded_nodes) { - std::unordered_set nodes2rm(op_nodes.begin(), op_nodes.end()); - for (auto& op_node : op_nodes) { - for (auto& in_var : op_node->inlinks) { - if (!nodes2rm.count(in_var)) { - nodes2rm.insert(in_var); - } - } - for (auto& out_var : op_node->outlinks) { - if (!nodes2rm.count(out_var)) { - nodes2rm.insert(out_var); - } - } - } - // some nodes should not be removed - for (auto& e : excluded_nodes) { - for (auto& i : e) { - if (nodes2rm.count(i)) { - nodes2rm.erase(i); - } - } - } - return nodes2rm; -} - -void SubgraphProgramPass::InferOnce(const std::unique_ptr& graph) { - for (auto& item : graph->StmtTopologicalOrder()) { - if (!item->IsStmt()) continue; - auto& stmt = item->AsStmt(); - auto& op = stmt.op(); - auto scope = op->scope(); - std::string op_type = op->op_info()->Type(); - // check the dimension of input variables in the scope, must not be empty ! 
- if (op_type == "feed") { - auto input_var_names = op->op_info()->output_names(); - CHECK_GE(input_var_names.size(), 1); - for (auto input_var_name : input_var_names) { - auto input_var = scope->FindVar(input_var_name); - CHECK(input_var) << "No input variable '" << input_var_name - << "' found in scope " << scope; - auto input = input_var->GetMutable(); - CHECK(!input->dims().empty()) << "The dimension of input variable '" - << input_var_name - << "' can not be empty."; - } - continue; - } - if (op_type == "fetch") { - continue; - } - op->CheckShape(); - op->InferShape(); - -#ifndef LITH_WITH_XPU - // TOOD(xxx): remove Launch() at last - auto& kkks = stmt.kernels(); - if (!kkks.empty()) { - auto& kk = stmt.kernels().front(); - if (kk) { - kk->Launch(); - } - } -#endif - } -} - -void SubgraphProgramPass::InitSubgraphID( - const std::unique_ptr& graph, - const std::vector& supported_op_types) { - for (auto& item : graph->StmtTopologicalOrder()) { - if (!item->IsStmt()) continue; - auto& stmt = item->AsStmt(); - stmt.ClearSubgraphID(); - if (std::find(supported_op_types.begin(), - supported_op_types.end(), - stmt.op_type()) != supported_op_types.end()) { - stmt.SetSubgraphID(0); - LOG(INFO) << "supported " << stmt.op_type(); - } else { - LOG(INFO) << "======= not supported " << stmt.op_type(); - } - } -} - -// mark current and all output supported nodes -void SubgraphProgramPass::ChangeAllOutConnectedID(Node* node, - int to_id, - int from_id) { - if (!node) return; - if (node->IsStmt()) { - auto& stmt = node->AsStmt(); - if (stmt.subgraph_id() == from_id) { - stmt.SetSubgraphID(to_id); - for (auto& i : node->outlinks) { - ChangeAllOutConnectedID(i, to_id, from_id); - } - } else { - LOG(INFO) << "failed op type:" << stmt.op_type(); - return; - } - } else { - // this it arg node - bool all_out_op_supported = true; - for (auto& i : node->outlinks) { - if (!i->IsStmt()) return; - auto& stmt = i->AsStmt(); - if (stmt.subgraph_id() < from_id) { - all_out_op_supported = false; - } - } - if (!all_out_op_supported) { - return; - } - for (auto& i : node->outlinks) { - CHECK(i->IsStmt()); - auto& stmt = i->AsStmt(); - if (stmt.subgraph_id() == from_id) { - stmt.SetSubgraphID(to_id); - for (auto& o : i->outlinks) { - ChangeAllOutConnectedID(o, to_id, from_id); - } - } - } - } -} - -int SubgraphProgramPass::FuseSubgraphID( - const std::unique_ptr& graph) { - int sub_id = 1; // id start from 1 not 0 - for (auto& item : graph->StmtTopologicalOrder()) { - // bool inputvar = false; - if (!item->IsStmt()) continue; - auto& stmt = item->AsStmt(); - /* - if (stmt.subgraph_id() == -1) { - for (auto& i : item->outlinks) { - for (auto& j : i->outlinks) { - if (j->IsStmt()) { - auto& jstmt = j->AsStmt(); - if (jstmt.subgraph_id() == 0) inputvar = true; - } - } - } - } - */ - if (stmt.subgraph_id() != 0) continue; - ChangeAllOutConnectedID(item, sub_id); - sub_id++; - } - return sub_id - 1; -} - -int SubgraphProgramPass::FuseSubgraph( - const std::unique_ptr& graph, - const std::vector& supported_op_types) { - InitSubgraphID(graph, supported_op_types); - return FuseSubgraphID(graph); -} -} // namespace subgraph -} // namespace mir -} // namespace lite -} // namespace paddle - -REGISTER_MIR_PASS(subgraph_program_pass, - paddle::lite::mir::subgraph::SubgraphProgramPass) - .BindTargets({TARGET(kAny)}); diff --git a/lite/core/mir/subgraph/subgraph_program_pass.h b/lite/core/mir/subgraph/subgraph_program_pass.h deleted file mode 100644 index 
24c0233bbb428a71fa5645b23573494b5067d8b1..0000000000000000000000000000000000000000 --- a/lite/core/mir/subgraph/subgraph_program_pass.h +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include "lite/core/mir/pass.h" - -namespace paddle { -namespace lite { -namespace mir { -namespace subgraph { - -class SubgraphProgramPass : public ProgramPass { - public: - using key2nodes_t = std::map; - - // make all the linked ops in subgraph with same subgraph_id - // return the fused subgraph numbers - int FuseSubgraph(const std::unique_ptr& graph, - const std::vector& supported_op_types); - - void Apply(const std::unique_ptr& graph) override{}; - - protected: - void InferOnce(const std::unique_ptr& graph); - - // clear all subgraph id and mark all ops, which could be fuse, as id zero - void InitSubgraphID(const std::unique_ptr& graph, - const std::vector& supported_op_types); - - // make all the linked ops in subgraph with same subgraph_id - // return the fused subgraph numbers - int FuseSubgraphID(const std::unique_ptr& graph); - - // // GenerateFusedGraph: - // std::unique_ptr GenerateFusedGraph(const - // std::unique_ptr& graph, int sub_num); - void ChangeAllOutConnectedID(Node* node, int to_id, int from_id = 0); - - // Below function cloud be useful in child classes // - // classify node by subgraph id - std::unordered_map> ClassifySubgraph( - const std::unique_ptr& graph); - - // generate the graph op desc - cpp::OpDesc GenGraphOpDesc(const std::string& weight_var_name, - const std::vector& in_var_names, - const std::vector& out_var_names); - - // insert a new graph op node - void InsertNewNode(const std::unique_ptr& graph, - const std::string& weight_var_name, - Scope* scope, - const std::vector& valid_places, - std::unordered_set in_data_vars, - std::unordered_set in_wgt_vars, - std::unordered_set out_data_vars, - std::unordered_set out_unused_vars); - - // Sort and return the topology order of nodes set - std::vector GetTopologicalOrder( - const std::unordered_set& nodes); - - // find all input data vars, input weight vars, - // output data vars and output vars from the nodes - void FindInputOutputVars(const std::unordered_set& op_nodes, - std::unordered_set* in_data_vars, - std::unordered_set* in_wgt_vars, - std::unordered_set* out_data_vars, - std::unordered_set* out_unused_vars); - - // return the node to remove in the subgraph - std::unordered_set GetNode2rm( - const std::unordered_set& op_nodes, - const std::vector>& excluded_nodes); - - private: - // sort nodes to operational sequence - void SortHelper(Node* node, - const std::unordered_set& nodes_all, - std::unordered_set* visited_nodes, - std::vector* ret); -}; - -} // namespace subgraph -} // namespace mir -} // namespace lite -} // namespace paddle diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc index 
b008faa687474a88988adb9da81c594306298b26..ae74bd8d4d5647139a13509dfda0bb2b41ecc5c7 100644 --- a/lite/core/mir/type_target_cast_pass.cc +++ b/lite/core/mir/type_target_cast_pass.cc @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include "lite/core/mir/graph_visualize_pass.h" @@ -35,18 +36,23 @@ void TypeTargetTransformPass::Apply(const std::unique_ptr& graph) { CHECK(!valid_places_.empty()); + // record the copied node. + std::unordered_map copied_nodes; + for (auto& node : nodes) { if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue; auto inlinks = node->inlinks; for (auto* in : inlinks) { - ComplementInputs(graph.get(), node, in); + ComplementInputs(graph.get(), node, in, &copied_nodes); } } } -void TypeTargetTransformPass::ComplementInputs(SSAGraph* graph, - Node* inst_node, - Node* in) { +void TypeTargetTransformPass::ComplementInputs( + SSAGraph* graph, + Node* inst_node, + Node* in, + std::unordered_map* copied_nodes) { // If this input is out of date. if (inst_node->inlinks.end() == std::find(inst_node->inlinks.begin(), inst_node->inlinks.end(), in)) @@ -67,8 +73,13 @@ void TypeTargetTransformPass::ComplementInputs(SSAGraph* graph, << " for kernel " << inst.op()->DebugString() << " " << *in->AsArg().type << " -> " << *decl_arg_type; // Add an IoCopy instruction to make the input compatible with other dist. - AddIoCopyInst( - *in->AsArg().type, *decl_arg_type, in, graph, inst_node, valid_places_); + AddIoCopyInst(*in->AsArg().type, + *decl_arg_type, + in, + graph, + inst_node, + copied_nodes, + valid_places_); } } @@ -78,128 +89,132 @@ void TypeTargetTransformPass::AddIoCopyInst( Node* in, SSAGraph* graph, Node* inst_node, + std::unordered_map* copied_nodes, const std::vector& valid_places) { CHECK(!valid_places.empty()) << "valid_place should be set"; // var -> new_transform_op -> new_var -> inst // So there will be a new Argument node and a new IoCopy Statement Node. CHECK(in->IsArg()); + // auto node_id = [&] { return graph->nodes().size(); }; auto io_copy_output_name = string_format("%s/target_trans", in->AsArg().name.c_str()); // string_format("%s/target_trans/%d", in->AsArg().name.c_str(), node_id()); - // TODO(MyPandaShaoxiang) should set same place with input? - auto* io_copy_output_arg = graph->NewArgumentNode(io_copy_output_name); - // Set the place for io_copy_output_arg node, the target should be equal to - // to.target() - // The precision and layout should be equal to from.precision(), from.layout() - io_copy_output_arg->AsArg().type = - LiteType::GetTensorTy(to.target(), from.precision(), from.layout()); - auto* io_copy_inst = graph->NewInstructNode(); - - bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; - std::string io_copy_type = in_persist ? "io_copy_once" : "io_copy"; - io_copy_output_arg->AsArg().is_persist = in_persist; - // create Op and kernels. - auto io_copy_op = LiteOpRegistry::Global().Create(io_copy_type); - CHECK(io_copy_op) << "create op [" << io_copy_op << "] failed"; - // CHECK(io_copy_op); - // Create the new var manually. - inst_node->AsStmt().op()->scope()->Var(io_copy_output_name); - - // Create IoCopy Instruction. 
- cpp::OpDesc op_desc; - op_desc.SetType(io_copy_type); - op_desc.SetInput("Input", {in->AsArg().name}); - op_desc.SetOutput("Out", {io_copy_output_name}); - - io_copy_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); - auto kernels = io_copy_op->CreateKernels(valid_places); - // fix(MyPandaShaoxiang): select kernel that input_dcl_type same as in.type - bool is_found = false; - std::vector> selected_kernels; - for (auto& kernel : kernels) { - const Type* in_arg_ty = kernel->GetInputDeclType("Input"); - const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); - - VLOG(4) << "------ kernel info -------"; - VLOG(4) << "*in_arg_ty(io_copy kernel input):" << *in_arg_ty; - VLOG(4) << "from(last kernel output):" << from; - VLOG(4) << "out_arg_ty(io_copy kernel output):" << *out_arg_ty; - VLOG(4) << "to:" << to << "\n"; - - // kernel choose branch for opencl backend - // judge inst's target whether is kOpenCL - // Note: to == *decl_arg_type == in of inst, not output of last inst - // ignore [layout check] for layout between [to] and [from] - // Because all of origin opencl insts in model, are not default layout - // NCHW, - // so skip layout check. - // detailed node info see below: - // [*in->AsArg().type] -> [from]: out of inst's previous kernel - // [*decl_arg_type] -> [to]: input of inst, not output of last - // [in_arg_ty]: in of io_copy - // [out_arg_ty]: out of io_copy - // - // noto: replace LITE_WITH_OPENCL macro with judge input and output target - // of io_copy - if ((in_arg_ty->target() == TARGET(kOpenCL) || - out_arg_ty->target() == TARGET(kOpenCL)) && // judge OpenCL first - (TargetCompatibleTo(*in_arg_ty, from) && - PrecisionCompatibleTo(*in_arg_ty, from) && - DeviceCompatibleTo(*in_arg_ty, from) && - TargetCompatibleTo(*out_arg_ty, to))) { - VLOG(4) << "picked, opencl found"; - is_found = true; - } else if (TypeCompatible(*in_arg_ty, from) && - out_arg_ty->target() == to.target()) { - VLOG(4) << "picked"; - is_found = true; - } - if (is_found) { - selected_kernels.emplace_back(std::move(kernel)); - // we pick the kernel - io_copy_inst->AsStmt( - io_copy_type, std::move(selected_kernels), io_copy_op); - break; + if (copied_nodes->count(in->AsArg().name)) { + // Remove the old link + RemoveDirectedLink(in, inst_node); + + // Update the original instruction OpDesc. + // Update its input to the io_copy_output_name + // Add new link, newarg->inst + DirectedLink(copied_nodes->at(in->AsArg().name), + inst_node); // [io_copy kernel]'s output -> [current kernel] + + UpdateInstNode(in, graph, inst_node, io_copy_output_name); + } else { + // TODO(MyPandaShaoxiang) should set same place with input? + auto* io_copy_output_arg = graph->NewArgumentNode(io_copy_output_name); + // Set the place for io_copy_output_arg node, the target should be equal to + // to.target() + // The precision and layout should be equal to from.precision(), + // from.layout() + io_copy_output_arg->AsArg().type = + LiteType::GetTensorTy(to.target(), from.precision(), from.layout()); + auto* io_copy_inst = graph->NewInstructNode(); + + bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; + std::string io_copy_type = in_persist ? "io_copy_once" : "io_copy"; + io_copy_output_arg->AsArg().is_persist = in_persist; + // create Op and kernels. + auto io_copy_op = LiteOpRegistry::Global().Create(io_copy_type); + CHECK(io_copy_op) << "create op [" << io_copy_op << "] failed"; + // CHECK(io_copy_op); + // Create the new var manually. 
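+    // The io_copy output var is not declared in the original program desc,
+    // so it is created in the scope manually here, before Attach() below
+    // looks it up by name.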
+ inst_node->AsStmt().op()->scope()->Var(io_copy_output_name); + + // Create IoCopy Instruction. + cpp::OpDesc op_desc; + op_desc.SetType(io_copy_type); + op_desc.SetInput("Input", {in->AsArg().name}); + op_desc.SetOutput("Out", {io_copy_output_name}); + + io_copy_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + auto kernels = io_copy_op->CreateKernels(valid_places); + // fix(MyPandaShaoxiang): select kernel that input_dcl_type same as in.type + bool is_found = false; + std::vector> selected_kernels; + for (auto& kernel : kernels) { + const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + + VLOG(4) << "------ kernel info -------"; + VLOG(4) << "*in_arg_ty(io_copy kernel input):" << *in_arg_ty; + VLOG(4) << "from(last kernel output):" << from; + VLOG(4) << "out_arg_ty(io_copy kernel output):" << *out_arg_ty; + VLOG(4) << "to:" << to << "\n"; + + // kernel choose branch for opencl backend + // judge inst's target whether is kOpenCL + // Note: to == *decl_arg_type == in of inst, not output of last inst + // ignore [layout check] for layout between [to] and [from] + // Because all of origin opencl insts in model, are not default layout + // NCHW, + // so skip layout check. + // detailed node info see below: + // [*in->AsArg().type] -> [from]: out of inst's previous kernel + // [*decl_arg_type] -> [to]: input of inst, not output of last + // [in_arg_ty]: in of io_copy + // [out_arg_ty]: out of io_copy + // + // noto: replace LITE_WITH_OPENCL macro with judge input and output target + // of io_copy + if ((in_arg_ty->target() == TARGET(kOpenCL) || + out_arg_ty->target() == TARGET(kOpenCL)) && // judge OpenCL first + (TargetCompatibleTo(*in_arg_ty, from) && + PrecisionCompatibleTo(*in_arg_ty, from) && + DeviceCompatibleTo(*in_arg_ty, from) && + TargetCompatibleTo(*out_arg_ty, to))) { + VLOG(4) << "picked, opencl found"; + is_found = true; + } else if (TypeCompatible(*in_arg_ty, from) && + out_arg_ty->target() == to.target()) { + VLOG(4) << "picked"; + is_found = true; + } + + if (is_found) { + selected_kernels.emplace_back(std::move(kernel)); + // we pick the kernel + io_copy_inst->AsStmt( + io_copy_type, std::move(selected_kernels), io_copy_op); + (*copied_nodes)[in->AsArg().name] = io_copy_output_arg; + break; + } + + VLOG(4) << "not picked"; } - VLOG(4) << "not picked"; - } + CHECK(is_found) << "Can't find a io_copy kernel for io_copy op: " << from + << ":" << in->AsArg().name << " -> " << to << ":" + << inst_node->AsStmt().op_info()->Type(); + // Remove the old link + RemoveDirectedLink(in, inst_node); - CHECK(is_found) << "Can't find a io_copy kernel for io_copy op: " << from - << ":" << in->AsArg().name << " -> " << to << ":" - << inst_node->AsStmt().op_info()->Type(); - // Remove the old link - RemoveDirectedLink(in, inst_node); - - // Update the original instruction OpDesc. - // Update its input to the io_copy_output_name - // Add new link, var -> new_inst, new_inst->newarg, newarg->inst - DirectedLink(in, io_copy_inst); // [last kernel]'s output -> [io_copy kernel] - DirectedLink( - io_copy_inst, - io_copy_output_arg); // [io_copy kernel] -> [io_copy kernel]'s output - DirectedLink(io_copy_output_arg, - inst_node); // [io_copy kernel]'s output -> [current kernel] + // Update the original instruction OpDesc. 
+ // Update its input to the io_copy_output_name + // Add new link, var -> new_inst, new_inst->newarg, newarg->inst + DirectedLink(in, + io_copy_inst); // [last kernel]'s output -> [io_copy kernel] + DirectedLink( + io_copy_inst, + io_copy_output_arg); // [io_copy kernel] -> [io_copy kernel]'s output + DirectedLink(io_copy_output_arg, + inst_node); // [io_copy kernel]'s output -> [current kernel] - // reset opdesc and update kernel information - UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), - in->AsArg().name, - io_copy_output_name); - auto original_selected_kernel = - std::move(inst_node->AsStmt().kernels().front()); - auto update_op_info = *inst_node->AsStmt().op_info(); - // ResetOp() will change the Stmt op_info_ value, - // after that the old op_info_ value will be nullified. - // So, we can't pass `*inst_node->AsStmt().op_info()` into ResetOp. - // `update_op_info` is the copy of `*inst_node->AsStmt().op_info(). - // Whenever update the op_info of a stmt, we should call its ResetOp(). - inst_node->AsStmt().ResetOp(update_op_info, graph->valid_places()); - inst_node->AsStmt().kernels().clear(); - inst_node->AsStmt().kernels().emplace_back( - std::move(original_selected_kernel)); + UpdateInstNode(in, graph, inst_node, io_copy_output_name); + } std::string tmp; if (inst_node->AsStmt().op_info()->GetInputArgname("a", &tmp)) { @@ -220,6 +235,28 @@ void TypeTargetTransformPass::SetValidPlaces( valid_places_ = valid_places; } +void TypeTargetTransformPass::UpdateInstNode(Node* in, + SSAGraph* graph, + Node* inst_node, + std::string io_copy_output_name) { + // reset opdesc and update kernel information + UpdateInputTo(inst_node->AsStmt().op()->mutable_op_info(), + in->AsArg().name, + io_copy_output_name); + auto original_selected_kernel = + std::move(inst_node->AsStmt().kernels().front()); + auto update_op_info = *inst_node->AsStmt().op_info(); + // ResetOp() will change the Stmt op_info_ value, + // after that the old op_info_ value will be nullified. + // So, we can't pass `*inst_node->AsStmt().op_info()` into ResetOp. + // `update_op_info` is the copy of `*inst_node->AsStmt().op_info(). + // Whenever update the op_info of a stmt, we should call its ResetOp(). 
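+  // The kernel picked for inst_node earlier is still valid, so
+  // UpdateInstNode() moves it aside, calls ResetOp(), and then restores it
+  // instead of re-selecting a kernel.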
+ inst_node->AsStmt().ResetOp(update_op_info, graph->valid_places()); + inst_node->AsStmt().kernels().clear(); + inst_node->AsStmt().kernels().emplace_back( + std::move(original_selected_kernel)); +} + } // namespace mir } // namespace lite } // namespace paddle diff --git a/lite/core/mir/type_target_cast_pass.h b/lite/core/mir/type_target_cast_pass.h index 8a8cfaf9f9282cb477f7b9dd404d6f869333221b..e9a275882f7c2cb813c1c0b8add5cc4ca89b0c8b 100644 --- a/lite/core/mir/type_target_cast_pass.h +++ b/lite/core/mir/type_target_cast_pass.h @@ -16,6 +16,7 @@ #include #include +#include #include #include "lite/core/mir/pass.h" #include "lite/core/op_registry.h" @@ -44,13 +45,17 @@ class TypeTargetTransformPass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; - void ComplementInputs(SSAGraph* graph, Node* inst_node, Node* in); + void ComplementInputs(SSAGraph* graph, + Node* inst_node, + Node* in, + std::unordered_map* copied_nodes); void AddIoCopyInst(const Type& from, const Type& to, Node* in, SSAGraph* graph, Node* inst_node, + std::unordered_map* copied_nodes, const std::vector& valid_places); void SetValidPlaces(const std::vector& valid_places); @@ -58,6 +63,11 @@ class TypeTargetTransformPass : public ProgramPass { const std::vector& valid_places() const { return valid_places_; } private: + void UpdateInstNode(Node* in, + SSAGraph* graph, + Node* inst_node, + std::string io_copy_output_name); + std::vector valid_places_; }; diff --git a/lite/core/mir/variable_place_inference_pass.h b/lite/core/mir/variable_place_inference_pass.h index 3f5d161a56aafa7fd9d058fd404e65cb04572116..875bf23082a24cb6fcae878b46cc9dcdbb2b76f7 100644 --- a/lite/core/mir/variable_place_inference_pass.h +++ b/lite/core/mir/variable_place_inference_pass.h @@ -48,6 +48,10 @@ class VariablePlaceInferencePass : public DebugPass { void CheckAllArgumentTypeDetermined(SSAGraph* graph) { for (auto& node : graph->mutable_nodes()) { if (node.IsArg()) { + if (node.inlinks.size() == 0 && node.outlinks.size() == 0) { + // empty node + continue; + } CHECK(node.AsArg().type) << "node " << node.AsArg().name << " type not determined, " << &node; } @@ -129,6 +133,17 @@ class VariablePlaceInferencePass : public DebugPass { } else { x_in->AsArg().type = type; } + } else if (x_in->AsArg().type->target() == TARGET(kUnk) && + x_in->AsArg().type->precision() != PRECISION(kUnk) && + x_in->AsArg().type->layout() == DATALAYOUT(kUnk)) { + // If is quantization, infer the Int8 type. + if (type->precision() == PRECISION(kInt8)) { + x_in->AsArg().type = type; + } else { + PrecisionType tmp_ptype = x_in->AsArg().type->precision(); + x_in->AsArg().type = LiteType::GetTensorTy( + type->target(), tmp_ptype, type->layout()); + } } } @@ -149,6 +164,17 @@ class VariablePlaceInferencePass : public DebugPass { } else { x_out->AsArg().type = type; } + } else if (x_out->AsArg().type->target() == TARGET(kUnk) && + x_out->AsArg().type->precision() != PRECISION(kUnk) && + x_out->AsArg().type->layout() == DATALAYOUT(kUnk)) { + // If is quantization, infer the Int8 type. 
+ if (type->precision() == PRECISION(kInt8)) { + x_out->AsArg().type = type; + } else { + PrecisionType tmp_ptype = x_out->AsArg().type->precision(); + x_out->AsArg().type = LiteType::GetTensorTy( + type->target(), tmp_ptype, type->layout()); + } } } } diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc index c23d3157e0a7ec77ec26afad6092d0be9a63a436..716ce9d6a82b07270b5029f4cddf6a6b808c6c21 100644 --- a/lite/core/op_registry.cc +++ b/lite/core/op_registry.cc @@ -40,6 +40,18 @@ std::list> KernelRegistry::Create( return Create(op_type); \ + case DATALAYOUT(kImageDefault): \ + return Create(op_type); \ + case DATALAYOUT(kImageFolder): \ + return Create(op_type); \ + case DATALAYOUT(kImageNW): \ + return Create(op_type); \ default: \ LOG(FATAL) << "unsupported kernel layout " << DataLayoutToStr(layout); \ } @@ -54,6 +66,8 @@ std::list> KernelRegistry::Create( CREATE_KERNEL1(target__, kFP16); \ case PRECISION(kAny): \ CREATE_KERNEL1(target__, kAny); \ + case PRECISION(kInt32): \ + CREATE_KERNEL1(target__, kInt32); \ case PRECISION(kInt64): \ CREATE_KERNEL1(target__, kInt64); \ default: \ @@ -136,6 +150,7 @@ KernelRegistry::KernelRegistry() INIT_FOR(kARM, kInt8, kNCHW); INIT_FOR(kARM, kAny, kNCHW); INIT_FOR(kARM, kAny, kAny); + INIT_FOR(kARM, kInt32, kNCHW); INIT_FOR(kOpenCL, kFloat, kNCHW); INIT_FOR(kOpenCL, kFloat, kNHWC); @@ -144,6 +159,17 @@ KernelRegistry::KernelRegistry() INIT_FOR(kOpenCL, kFloat, kAny); INIT_FOR(kOpenCL, kInt8, kNCHW); INIT_FOR(kOpenCL, kAny, kAny); + INIT_FOR(kOpenCL, kFP16, kNCHW); + INIT_FOR(kOpenCL, kFP16, kNHWC); + INIT_FOR(kOpenCL, kFP16, kImageDefault); + INIT_FOR(kOpenCL, kFP16, kImageFolder); + INIT_FOR(kOpenCL, kFP16, kImageNW); + INIT_FOR(kOpenCL, kFloat, kImageDefault); + INIT_FOR(kOpenCL, kFloat, kImageFolder); + INIT_FOR(kOpenCL, kFloat, kImageNW); + INIT_FOR(kOpenCL, kAny, kImageDefault); + INIT_FOR(kOpenCL, kAny, kImageFolder); + INIT_FOR(kOpenCL, kAny, kImageNW); INIT_FOR(kNPU, kFloat, kNCHW); INIT_FOR(kNPU, kInt8, kNCHW); diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index d78ae690f9b019dff7728bd3e95c0b1406bea463..0df5cb41ecc4c631e8540f9595c3182122b99f5f 100644 --- a/lite/core/op_registry.h +++ b/lite/core/op_registry.h @@ -145,6 +145,9 @@ class KernelRegistry final { KernelRegistryForTarget *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, // @@ -173,6 +176,39 @@ class KernelRegistry final { KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // KernelRegistryForTarget GenRuntimeProgram() { - // Extra passes are applied for NPU and XPU, they depends on the shapes - // of input tensors. so GenRuntimeProgram() must be called after the shapes - // of input tensors are determined. 
- std::vector subgraph_passes{"generate_npu_program_pass", - "generate_xpu_program_pass"}; - RunPasses(subgraph_passes); - auto pass = mir::PassManager::Global().LookUp( "generate_program_pass"); pass->Apply(graph_); diff --git a/lite/core/profile/profiler.cc b/lite/core/profile/profiler.cc index a51b769c8f46a5ca8cb9ed74740b93844882cb16..78317f78ac6bf7024c1984c2127434d55b738ad6 100644 --- a/lite/core/profile/profiler.cc +++ b/lite/core/profile/profiler.cc @@ -21,6 +21,13 @@ namespace paddle { namespace lite { namespace profile { +namespace { +auto op_comp = [](const OpCharacter& c1, const OpCharacter& c2) { + return (c1.target < c2.target) || (c1.op_type < c2.op_type) || + (c1.kernel_name < c2.kernel_name) || (c1.remark < c2.remark); +}; +} + int Profiler::NewTimer(const OpCharacter& ch) { StatisUnit unit; unit.character = ch; @@ -50,61 +57,66 @@ float Profiler::StopTiming(const int index, KernelContext* ctx) { return units_[index].timer->Stop(ctx); } -std::string Profiler::Summary(bool concise) { +std::string Profiler::Summary(bool concise, size_t w) { + using std::setw; + using std::left; + using std::fixed; STL::stringstream ss; - auto cout_title = [&ss](const std::string& title, const std::string& name) { - // clang-format off - ss << "===== " << title << ": " << name << " =====" << std::endl; - ss << std::setw(25) << std::left << "Operator Type" \ - << std::setw(40) << std::left << "Kernel Name" \ - << std::setw(10) << std::left << "Remark" \ - << std::setw(10) << std::left << "Avg (ms)" \ - << std::setw(10) << std::left << "Min (ms)" \ - << std::setw(10) << std::left << "Max (ms)" \ + std::string title; + // Title. + if (concise) { + ss << "Timing cycle = " << units_.front().timer->LapTimes().Size() << std::endl; - // clang-format on - }; + ss << "===== Concise Profiler Summary: " << name_ << ", Exclude " << w + << " warm-ups =====" << std::endl; + } else { + ss << "===== Detailed Profiler Summary: " << name_ << ", Exclude " << w + << " warm-ups =====" << std::endl; + } + ss << setw(25) << left << "Operator Type" + << " " << setw(40) << left << "Kernel Name" + << " " << setw(12) << left << "Remark" + << " " << setw(12) << left << "Avg (ms)" + << " " << setw(12) << left << "Min (ms)" + << " " << setw(12) << left << "Max (ms)" + << " " << setw(12) << left << "Last (ms)" << std::endl; + // Profile information. 
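+  // In concise mode, the laps of all timers sharing the same OpCharacter
+  // (target, op type, kernel name and remark) are accumulated into a single
+  // row; in detailed mode each timer gets its own row.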
if (concise) { - auto op_comp = [](const OpCharacter& c1, const OpCharacter& c2) { - return (c1.target < c2.target) || (c1.op_type < c2.op_type) || - (c1.kernel_name < c2.kernel_name) || (c1.remark < c2.remark); - }; std::map summary(op_comp); for (auto& unit : units_) { auto ch = summary.find(unit.character); if (ch != summary.end()) { - ch->second.avg += unit.timer->LapTimes().Avg(); - ch->second.min += unit.timer->LapTimes().Min(); - ch->second.max += unit.timer->LapTimes().Max(); + ch->second.avg += unit.timer->LapTimes().Avg(w); + ch->second.min += unit.timer->LapTimes().Min(w); + ch->second.max += unit.timer->LapTimes().Max(w); } else { - TimeInfo info({unit.timer->LapTimes().Avg(), - unit.timer->LapTimes().Min(), - unit.timer->LapTimes().Max()}); + TimeInfo info({unit.timer->LapTimes().Avg(w), + unit.timer->LapTimes().Min(w), + unit.timer->LapTimes().Max(w)}); summary.insert({unit.character, info}); } } - cout_title("Concise Profiler Summary", name_); for (const auto& item : summary) { // clang-format off - ss << std::setw(25) << std::left << item.first.op_type \ - << std::setw(40) << std::left << item.first.kernel_name \ - << std::setw(10) << std::left << item.first.remark \ - << std::setw(10) << std::left << item.second.avg \ - << std::setw(10) << std::left << item.second.min \ - << std::setw(10) << std::left << item.second.max \ - << std::endl; + ss << setw(25) << left << fixed << item.first.op_type \ + << " " << setw(40) << left << fixed << item.first.kernel_name \ + << " " << setw(12) << left << fixed << item.first.remark \ + << " " << setw(12) << left << fixed << item.second.avg \ + << " " << setw(12) << left << fixed << item.second.min \ + << " " << setw(12) << left << fixed << item.second.max \ + << " " << std::endl; // clang-format on } } else { - cout_title("Detailed Profiler Summary", name_); for (auto& unit : units_) { // clang-format off - ss << std::setw(25) << std::left << unit.character.op_type \ - << std::setw(40) << std::left << unit.character.kernel_name \ - << std::setw(10) << std::left << unit.character.remark \ - << std::setw(10) << std::left << unit.timer->LapTimes().Avg() \ - << std::setw(10) << std::left << unit.timer->LapTimes().Min() \ - << std::setw(10) << std::left << unit.timer->LapTimes().Max() \ + ss << setw(25) << left << fixed << unit.character.op_type \ + << " " << setw(40) << left << fixed << unit.character.kernel_name \ + << " " << setw(12) << left << fixed << unit.character.remark \ + << " " << setw(12) << left << fixed << unit.timer->LapTimes().Avg(w) \ + << " " << setw(12) << left << fixed << unit.timer->LapTimes().Min(w) \ + << " " << setw(12) << left << fixed << unit.timer->LapTimes().Max(w) \ + << " " << setw(12) << left << fixed << unit.timer->LapTimes().Last(w) \ << std::endl; // clang-format on } diff --git a/lite/core/profile/profiler.h b/lite/core/profile/profiler.h index 0fce8167cdd5383c2cc4ae5d641433582f0ee6a7..4e9e9ae31c1a6d7f331eac2e77c4971986bd42a1 100644 --- a/lite/core/profile/profiler.h +++ b/lite/core/profile/profiler.h @@ -47,7 +47,7 @@ class Profiler final { int NewTimer(const OpCharacter& ch); void StartTiming(const int index, KernelContext* ctx); float StopTiming(const int index, KernelContext* ctx); - std::string Summary(bool concise = true); + std::string Summary(bool concise = true, size_t warm_up = 10); private: std::string name_{std::string("N/A")}; diff --git a/lite/core/profile/timer.h b/lite/core/profile/timer.h index 1e86f0d7b9be4914bdf1a6874195276d3c1b61ee..e9bb16bd27d5ec6fd21814c35db52b2467a12b51 100644 --- 
diff --git a/lite/core/profile/profiler.h b/lite/core/profile/profiler.h
index 0fce8167cdd5383c2cc4ae5d641433582f0ee6a7..4e9e9ae31c1a6d7f331eac2e77c4971986bd42a1 100644
--- a/lite/core/profile/profiler.h
+++ b/lite/core/profile/profiler.h
@@ -47,7 +47,7 @@ class Profiler final {
   int NewTimer(const OpCharacter& ch);
   void StartTiming(const int index, KernelContext* ctx);
   float StopTiming(const int index, KernelContext* ctx);
-  std::string Summary(bool concise = true);
+  std::string Summary(bool concise = true, size_t warm_up = 10);
 
  private:
   std::string name_{std::string("N/A")};
diff --git a/lite/core/profile/timer.h b/lite/core/profile/timer.h
index 1e86f0d7b9be4914bdf1a6874195276d3c1b61ee..e9bb16bd27d5ec6fd21814c35db52b2467a12b51 100644
--- a/lite/core/profile/timer.h
+++ b/lite/core/profile/timer.h
@@ -15,7 +15,7 @@
 #pragma once
 #include <algorithm>
 #include <chrono>  // NOLINT
-#include <list>
+#include <vector>
 #ifdef LITE_WITH_CUDA
 #include "lite/backends/cuda/cuda_utils.h"
 #endif
@@ -30,20 +30,44 @@ class TimeList {
  public:
   void Clear() { laps_t_.clear(); }
   void Add(T t) { laps_t_.push_back(t); }
-  T Max() const { return *std::max_element(laps_t_.begin(), laps_t_.end()); }
-  T Min() const { return *std::min_element(laps_t_.begin(), laps_t_.end()); }
-  T Sum() const { return std::accumulate(laps_t_.begin(), laps_t_.end(), 0.0); }
-  size_t Size() const { return laps_t_.size(); }
-  T Avg() const {
-    if (!Size()) {
+  T Last(size_t offset = 0) const {
+    if (!Size(offset)) {
       return 0;
     }
-    return Sum() / Size();
+    return laps_t_.back();
   }
-  const std::list<T>& Raw() const { return laps_t_; }
+  T Max(size_t offset = 0) const {
+    if (!Size(offset)) {
+      return 0;
+    }
+    return *std::max_element((laps_t_.begin() + offset), laps_t_.end());
+  }
+  T Min(size_t offset = 0) const {
+    if (!Size(offset)) {
+      return 0;
+    }
+    return *std::min_element((laps_t_.begin() + offset), laps_t_.end());
+  }
+  T Sum(size_t offset = 0) const {
+    if (!Size(offset)) {
+      return 0;
+    }
+    return std::accumulate((laps_t_.begin() + offset), laps_t_.end(), 0.0);
+  }
+  size_t Size(size_t offset = 0) const {
+    size_t size = (laps_t_.size() <= offset) ? 0 : (laps_t_.size() - offset);
+    return size;
+  }
+  T Avg(size_t offset = 0) const {
+    if (!Size(offset)) {
+      return 0;
+    }
+    return Sum(offset) / Size(offset);
+  }
+  const std::vector<T>& Raw() const { return laps_t_; }
 
  private:
-  std::list<T> laps_t_;
+  std::vector<T> laps_t_;
 };
 
 class Timer {
@@ -69,8 +93,10 @@ class Timer {
   const TimeList<float>& LapTimes() const { return laps_t_; }
 
  protected:
-  std::chrono::time_point<std::chrono::system_clock> t_start_, t_stop_;
   TimeList<float> laps_t_;
+
+ private:
+  std::chrono::time_point<std::chrono::system_clock> t_start_, t_stop_;
 };
 
 template <TargetType Target>
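
Note: every TimeList statistic now takes an offset that skips the first offset laps, so warm-up iterations can be excluded without copying the lap list. Size(offset) clamps to zero and the other methods use it to guard empty ranges, so Avg never divides by zero; Last uses offset only for that emptiness check and always returns the final lap. A minimal usage sketch with made-up lap times:

#include "lite/core/profile/timer.h"

void WarmupExclusionDemo() {
  paddle::lite::profile::TimeList<float> laps;
  for (float ms : {9.0f, 2.0f, 1.0f, 1.2f}) laps.Add(ms);  // lap 0 is warm-up
  float avg = laps.Avg(1);   // (2.0 + 1.0 + 1.2) / 3 = 1.4, warm-up excluded
  float max = laps.Max(1);   // 2.0, since the 9.0 warm-up lap is ignored
  float sum = laps.Sum(10);  // offset >= Size(), so the guard returns 0
  (void)avg; (void)max; (void)sum;
}
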
diff --git a/lite/core/program.cc b/lite/core/program.cc
index 45796a478b3f2309912e6382b3380bf0734bd6ae..b0c61bf00ed29e2fa71072b64f11f6ba30f77691 100644
--- a/lite/core/program.cc
+++ b/lite/core/program.cc
@@ -17,6 +17,8 @@
 #include "lite/model_parser/cpp/block_desc.h"
 #include "lite/model_parser/cpp/op_desc.h"
 #include "lite/model_parser/cpp/var_desc.h"
+#include "lite/operators/conditional_block_op.h"
+#include "lite/operators/subgraph_op.h"
 #include "lite/operators/while_op.h"
 #ifdef LITE_WITH_PROFILE
 #include "lite/core/profile/precision_profiler.h"
@@ -30,10 +32,32 @@ void RuntimeProgram::SaveOpInfosToProgram(cpp::ProgramDesc* desc) {
   // NOTE: RuntimeProgram does not have all meta info, so saving the model
   // just updates the origin model
   CHECK(desc->BlocksSize());
-  auto& main_block = *desc->GetBlock<cpp::BlockDesc>(0);
-  main_block.ClearOps();
+  auto main_block = desc->GetBlock<cpp::BlockDesc>(0);
+  main_block->ClearOps();
   for (auto& node : instructions_) {
-    auto* op = main_block.AddOp<cpp::OpDesc>();
+    auto op_type = node.op()->op_info()->Type();
+    if (op_type == "subgraph") {
+      auto subgraph_op = const_cast<operators::SubgraphOp*>(
+          static_cast<const operators::SubgraphOp*>(node.op()));
+      int sub_block_idx = subgraph_op->op_info()->GetAttr<int32_t>("sub_block");
+      if (sub_block_idx < 0) {
+        // It's a new subgraph op when its sub_block_idx < 0. Add its subblock
+        // desc to the program desc, then update its sub_block_idx to the
+        // index of that block desc within the program desc.
+        sub_block_idx = desc->BlocksSize();
+        auto sub_block_desc = subgraph_op->GetSubBlock();
+        CHECK(sub_block_desc);
+        auto new_block_desc = desc->AddBlock<cpp::BlockDesc>();
+        *new_block_desc = *sub_block_desc;
+        delete sub_block_desc;
+        subgraph_op->mutable_op_info()->SetAttr<int32_t>("sub_block",
+                                                         sub_block_idx);
+        subgraph_op->SetSubBlock(new_block_desc);
+        // Update main block desc after a new subblock desc is added
+        main_block = desc->GetBlock<cpp::BlockDesc>(0);
+      }
+    }
+    auto op = main_block->AddOp<cpp::OpDesc>();
     *op = *node.op()->op_info();
     op->SetAttr(kKernelTypeAttr, node.kernel()->SerializedKernelType());
   }
@@ -123,7 +147,7 @@ void RuntimeProgram::Run() {
 #endif  // LITE_WITH_PROFILE
   }
 #ifdef LITE_WITH_PROFILE
-  LOG(INFO) << "\n" << profiler_.Summary();
+  LOG(INFO) << "\n" << profiler_.Summary(false, 0);
 #endif  // LITE_WITH_PROFILE
 }
 
@@ -141,12 +165,26 @@ void Program::Build(const cpp::ProgramDesc& prog) {
     VLOG(4) << "create Op [" << op_type << "]";
     auto op = LiteOpRegistry::Global().Create(op_type);
     CHECK(op) << "no Op found for " << op_type;
-    if (op_type == "while") {
+    if (op_type == "while" || op_type == "conditional_block" ||
+        op_type == "subgraph") {
       auto sub_block_idx = op_desc.GetAttr<int32_t>("sub_block");
-      auto sub_block =
+      CHECK(sub_block_idx >= 0 && sub_block_idx < program.BlocksSize())
+          << "Invalid attribute sub_block(" << sub_block_idx << ") for "
+          << op_type;
+      auto sub_block_desc =
           const_cast<cpp::ProgramDesc&>(prog).GetBlock<cpp::BlockDesc>(
               sub_block_idx);
-      static_cast<operators::WhileOpLite*>(op.get())->SetSubBlock(sub_block);
+      CHECK(sub_block_desc);
+      if (op_type == "while") {
+        static_cast<operators::WhileOpLite*>(op.get())->SetSubBlock(
+            sub_block_desc);
+      } else if (op_type == "conditional_block") {
+        static_cast<operators::ConditionalBlockOpLite*>(op.get())->SetSubBlock(
+            sub_block_desc);
+      } else if (op_type == "subgraph") {
+        static_cast<operators::SubgraphOp*>(op.get())->SetSubBlock(
+            sub_block_desc);
+      }
     }
     ops_.emplace_back(std::move(op));
     ops_.back()->Attach(op_desc, exec_scope_);
@@ -162,6 +200,27 @@ void Program::PrepareWorkspace(const cpp::ProgramDesc& prog) {
   tmp_vars_.push_back("feed");
   tmp_vars_.push_back("fetch");
 
+  auto VarPrecision2KernlPrecision =
+      [](const lite::VarDescAPI::Type& type) -> PrecisionType {
+    switch (type) {
+      case lite::VarDescAPI::Type::FP32:
+        return PRECISION(kFloat);
+      case lite::VarDescAPI::Type::FP16:
+        return PRECISION(kFP16);
+      case lite::VarDescAPI::Type::INT8:
+        return PRECISION(kInt8);
+      case lite::VarDescAPI::Type::INT16:
+        return PRECISION(kInt16);
+      case lite::VarDescAPI::Type::INT32:
+        return PRECISION(kInt32);
+      case lite::VarDescAPI::Type::INT64:
+        return PRECISION(kInt64);
+      default:
+        // LOG(FATAL) << "not supported type: " << static_cast<int>(type);
+        return PRECISION(kUnk);
+    }
+  };
+
   auto program = prog;
   CHECK(program.BlocksSize());
   for (size_t b = 0; b < program.BlocksSize(); ++b) {
@@ -169,7 +228,16 @@
     for (size_t i = 0; i < main_block.VarsSize(); ++i) {
       auto& var_desc = *main_block.GetVar<cpp::VarDesc>(i);
       if (!var_desc.Persistable()) {
+        if (var_desc.GetType() == lite::VarDescAPI::Type::LOD_TENSOR &&
+            VarPrecision2KernlPrecision(var_desc.GetDataType()) !=
+                PRECISION(kUnk)) {
+          var_data_type_[var_desc.Name()] =
+              VarPrecision2KernlPrecision(var_desc.GetDataType());
+        }
         tmp_vars_.push_back(var_desc.Name());
+        VLOG(4) << "var name: " << var_desc.Name() << " type is "
+                << static_cast<int>(var_desc.GetType()) << " data type is "
+                << static_cast<int>(var_desc.GetDataType());
         exec_scope_->Var(var_desc.Name());
         if (b > 0) {
           VLOG(4) << "var: " << var_desc.Name();
@@ -194,14 +262,10 @@ void Instruction::Run() {
   if (op_->run_once() && has_run_) {
     return;
   }
-#ifndef LITE_SHUTDOWN_LOG
-  VLOG(4) << "kernel launch";
-#endif
+  // VLOG(4) << "kernel launch";
   op_->InferShape();
-#ifndef LITE_SHUTDOWN_LOG
-  VLOG(4) << ">> Running kernel: " << op_->op_info()->Repr() << " on Target "
-          << TargetToStr(kernel_->target());
-#endif
+  // VLOG(4) << ">> Running kernel: " << op_->op_info()->Repr() << " on Target "
+  //         << TargetToStr(kernel_->target());
   kernel_->Launch();
   has_run_ = true;
 }
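
Note: two details in the program.cc changes are easy to miss. A subgraph op generated in memory by a fusion pass carries sub_block = -1 as a sentinel, since its block desc is owned by the op and not yet part of the program desc; SaveOpInfosToProgram appends the block, rewrites the attribute, and re-fetches main_block because AddBlock can reallocate the desc's block storage. Separately, PrepareWorkspace now records a kernel precision for every non-persistable LOD_TENSOR variable whose type maps cleanly, which later code can query by name. A hedged lookup sketch (the variable name is hypothetical):

#include "lite/core/program.h"

void QueryVarPrecision(const paddle::lite::Program& program) {
  const auto& types = program.var_data_type();
  auto it = types.find("conv2d_0.tmp_0");  // hypothetical variable name
  if (it != types.end()) {
    paddle::lite::PrecisionType p = it->second;  // e.g. kFloat for an FP32 var
    (void)p;
  }
}
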
diff --git a/lite/core/program.h b/lite/core/program.h
index 1c1e4975c3a13bcfa9a22999a705f3a78b0fc68e..291252619b396f18576b935a0189f4ecdba7867f 100644
--- a/lite/core/program.h
+++ b/lite/core/program.h
@@ -16,6 +16,7 @@
 #include <list>
 #include <memory>
 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 #include "lite/core/kernel.h"
@@ -63,6 +64,10 @@ struct Program {
   lite::Scope* exec_scope() { return exec_scope_; }
   lite::Scope* scope() { return scope_.get(); }
 
+  const std::unordered_map<std::string, PrecisionType>& var_data_type() const {
+    return var_data_type_;
+  }
+
  private:
   // Build from a program and scope.
   void Build(const cpp::ProgramDesc& program);
@@ -70,6 +75,7 @@ struct Program {
   void PrepareWorkspace(const cpp::ProgramDesc& program);
 
  private:
+  std::unordered_map<std::string, PrecisionType> var_data_type_;
   std::list<std::string> tmp_vars_;
   std::list<std::string> weights_;
   std::list<std::shared_ptr<OpLite>> ops_;
@@ -135,6 +141,11 @@ class LITE_API RuntimeProgram {
     set_profiler();
 #endif
   }
+  ~RuntimeProgram() {
+#ifdef LITE_WITH_PROFILE
+    LOG(INFO) << "\n" << profiler_.Summary();
+#endif  // LITE_WITH_PROFILE
+  }
 
   void Run();
diff --git a/lite/core/tensor.cc b/lite/core/tensor.cc
index 1c7db871c7b525d6e4944fd0d669e81bcaff7f2a..ecfdcf3d1107953f1c41ea57b6f12187b29686c6 100644
--- a/lite/core/tensor.cc
+++ b/lite/core/tensor.cc
@@ -104,6 +104,12 @@ const cl::Image2D *TensorLite::data<float, cl::Image2D>() const {
   if (nullptr == buffer_->data()) return nullptr;
   return static_cast<const cl::Image2D *>(buffer_->data());
 }
+
+template <>  // use int16_t to represent half float
+const cl::Image2D *TensorLite::data<int16_t, cl::Image2D>() const {
+  if (nullptr == buffer_->data()) return nullptr;
+  return static_cast<const cl::Image2D *>(buffer_->data());
+}
 #endif
 
 }  // namespace lite
diff --git a/lite/core/tensor.h b/lite/core/tensor.h
index 8c4fe1604a517332e52b243404828e81af26f419..a1141c613e29326a5f9ffb2fdc1427e3fbe84481 100644
--- a/lite/core/tensor.h
+++ b/lite/core/tensor.h
@@ -147,9 +147,11 @@ class TensorLite {
 
 #ifdef LITE_WITH_OPENCL
   template <typename T, typename R = T>
-  R *mutable_data(const size_t img_w, const size_t img_h) {
+  R *mutable_data(const size_t img_w,
+                  const size_t img_h,
+                  void *host_ptr = nullptr) {
     target_ = TARGET(kOpenCL);
-    buffer_->ResetLazyImage2D<T>(target_, img_w, img_h);
+    buffer_->ResetLazyImage2D<T>(target_, img_w, img_h, host_ptr);
     return static_cast<R *>(buffer_->data());
   }
 #endif
@@ -251,6 +253,9 @@ bool TensorCompareWith(const TensorT &a, const TensorT &b) {
 #ifdef LITE_WITH_OPENCL
 template <>
 const cl::Image2D *TensorLite::data<float, cl::Image2D>() const;
+
+template <>  // use int16_t to represent half float
+const cl::Image2D *TensorLite::data<int16_t, cl::Image2D>() const;
 #endif
 
 }  // namespace lite
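
Note: C++ has no built-in half-precision type, so the new specialization keys FP16 image access on int16_t, which has the matching size and alignment, and the added host_ptr parameter lets mutable_data wrap an existing host buffer instead of allocating. A usage sketch, assuming an OpenCL-enabled build; the tensor, buffer, and image size are made up:

#include <cstdint>
#include "lite/core/tensor.h"

void HalfImageDemo(paddle::lite::TensorLite *t, void *host_buf) {
  // Lazily create a 256x64 half-float image; int16_t stands in for cl_half.
  cl::Image2D *img = t->mutable_data<int16_t, cl::Image2D>(256, 64);
  // Or wrap an existing host buffer instead of allocating device memory:
  cl::Image2D *img2 = t->mutable_data<int16_t, cl::Image2D>(256, 64, host_buf);
  // Read-only access resolves through the new const specialization:
  const cl::Image2D *view = t->data<int16_t, cl::Image2D>();
  (void)img; (void)img2; (void)view;
}
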
diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt
index 0c8866eaf88145d3bb0703b32ffb3eaf80332898..f543c000f8a202d891cd27958fb23dcf38e0240c 100644
--- a/lite/kernels/arm/CMakeLists.txt
+++ b/lite/kernels/arm/CMakeLists.txt
@@ -1,3 +1,10 @@
+# NOTE we leave the add_kernel not protected by LITE_WITH_LIGHT_WEIGHT_FRAMEWORK so that all the kernels will be registered
+# to the model_optimize_tool.
+if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))
+  return()
+endif()
+
+message(STATUS "compile with lite ARM kernels")
 # 1. basic kernels for basic models
 # for conv op
@@ -41,6 +48,7 @@ add_kernel(affine_channel_compute_arm ARM basic SRCS affine_channel_compute.cc D
 add_kernel(range_compute_arm ARM basic SRCS range_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(dropout_compute_arm ARM basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(layout_compute_arm ARM basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(instance_norm_compute_arm ARM basic SRCS instance_norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
 
 ## 2. other basic kernels: basic kernels that are not used in basic models
 add_kernel(negative_compute_arm ARM extra SRCS negative_compute.cc DEPS ${lite_kernel_deps} math_arm)
@@ -61,11 +69,17 @@ add_kernel(im2sequence_compute_arm ARM extra SRCS im2sequence_compute.cc DEPS ${
 add_kernel(sequence_pool_compute_arm ARM extra SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(layer_norm_compute_arm ARM extra SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(gather_compute_arm ARM extra SRCS gather_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(reduce_prod_compute_arm ARM extra SRCS reduce_prod_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(split_lod_tensor_compute_arm ARM extra SRCS split_lod_tensor_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(merge_lod_tensor_compute_arm ARM extra SRCS merge_lod_tensor_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(anchor_generator_compute_arm ARM extra SRCS anchor_generator_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(generate_proposals_compute_arm ARM extra SRCS generate_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(roi_align_compute_arm ARM extra SRCS roi_align_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(box_clip_compute_arm ARM extra SRCS box_clip_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(assign_value_compute_arm ARM extra SRCS assign_value_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(conditional_block_compute_arm ARM extra SRCS conditional_block_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(collect_fpn_proposals_compute_arm ARM extra SRCS collect_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(distribute_fpn_proposals_compute_arm ARM extra SRCS distribute_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm)
 
 # for OCR specific
@@ -87,13 +101,6 @@ add_kernel(fill_constant_compute_arm ARM basic SRCS fill_constant_compute.cc DEP
 add_kernel(lod_reset_compute_arm ARM extra SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(is_empty_compute_arm ARM extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps} math_arm)
 
-# NOTE we leave the add_kernel not protected by LITE_WITH_LIGHT_WEIGHT_FRAMEWORK so that all the kernels will be registered
-# to the model_optimize_tool.
-if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))
-  return()
-endif()
-
-message(STATUS "compile with lite ARM kernels")
 
 lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm)
 lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm)
@@ -107,6 +114,8 @@ lite_cc_test(test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS tran
 lite_cc_test(test_argmax_compute_arm SRCS argmax_compute_test.cc DEPS argmax_compute_arm)
 lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_compute_arm)
 if(LITE_BUILD_EXTRA)
+  lite_cc_test(test_split_lod_tensor_compute_arm SRCS split_lod_tensor_compute_test.cc DEPS split_lod_tensor_compute_arm)
+  lite_cc_test(test_merge_lod_tensor_compute_arm SRCS merge_lod_tensor_compute_test.cc DEPS merge_lod_tensor_compute_arm)
   lite_cc_test(test_lrn_compute_arm SRCS lrn_compute_test.cc DEPS lrn_compute_arm)
   lite_cc_test(test_decode_bboxes_compute_arm SRCS decode_bboxes_compute_test.cc DEPS decode_bboxes_compute_arm)
   lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm)
diff --git a/lite/kernels/arm/cast_compute.cc b/lite/kernels/arm/cast_compute.cc
index 1fef52bcb77b7c3efdcd848ee63f8ec46c16d6f8..266ae1fc916af4303aca274c39b9b4923fdbb154 100644
--- a/lite/kernels/arm/cast_compute.cc
+++ b/lite/kernels/arm/cast_compute.cc
@@ -74,6 +74,6 @@ void CastCompute::Run() {
 
 REGISTER_LITE_KERNEL(
     cast, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::CastCompute, def)
-    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
     .Finalize();
diff --git a/lite/kernels/arm/collect_fpn_proposals_compute.cc b/lite/kernels/arm/collect_fpn_proposals_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d54b96348e866bbe16898ddd6fdbd45beb62afa0
--- /dev/null
+++ b/lite/kernels/arm/collect_fpn_proposals_compute.cc
@@ -0,0 +1,147 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/arm/collect_fpn_proposals_compute.h"
+#include <algorithm>
+#include <cstring>  // std::memcpy
+#include <vector>
+#include "lite/backends/arm/math/funcs.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/tensor.h"
+#include "lite/core/type_system.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace arm {
+
+struct ScoreWithID {
+  float score;
+  int batch_id;
+  int index;
+  int level;
+  ScoreWithID() {
+    batch_id = -1;
+    index = -1;
+    level = -1;
+  }
+  ScoreWithID(float score_, int batch_id_, int index_, int level_) {
+    score = score_;
+    batch_id = batch_id_;
+    index = index_;
+    level = level_;
+  }
+};
+
+static inline bool CompareByScore(ScoreWithID a, ScoreWithID b) {
+  return a.score >= b.score;
+}
+
+static inline bool CompareByBatchid(ScoreWithID a, ScoreWithID b) {
+  return a.batch_id < b.batch_id;
+}
+
+void CollectFpnProposalsCompute::Run() {
+  auto& param = Param<param_t>();
+  auto multi_layer_rois = param.multi_level_rois;
+  auto multi_layer_scores = param.multi_level_scores;
+  auto* fpn_rois = param.fpn_rois;
+  int post_nms_topN = param.post_nms_topN;
+
+  if (multi_layer_rois.size() != multi_layer_scores.size()) {
+    LOG(FATAL) << "multi_layer_rois.size() should be equal to "
+                  "multi_layer_scores.size()";
+  }
+
+  size_t num_fpn_level = multi_layer_rois.size();
+  std::vector<int> integral_of_all_rois(num_fpn_level + 1, 0);
+  for (size_t i = 0; i < num_fpn_level; ++i) {
+    auto cur_rois_lod = multi_layer_rois[i]->lod().back();
+    integral_of_all_rois[i + 1] = static_cast<int>(
+        integral_of_all_rois[i] + cur_rois_lod[cur_rois_lod.size() - 1]);
+  }
+
+  std::vector<ScoreWithID> scores_of_all_rois(
+      integral_of_all_rois[num_fpn_level], ScoreWithID());
+  for (int i = 0; i < num_fpn_level; ++i) {
+    const float* cur_level_scores = multi_layer_scores[i]->data<float>();
+    int cur_level_num = integral_of_all_rois[i + 1] - integral_of_all_rois[i];
+    auto cur_scores_lod = multi_layer_scores[i]->lod().back();
+    int cur_batch_id = 0;
+    for (int j = 0; j < cur_level_num; ++j) {
+      if (j >= cur_scores_lod[cur_batch_id + 1]) {
+        cur_batch_id++;
+      }
+      int cur_index = j + integral_of_all_rois[i];
+      scores_of_all_rois[cur_index].score = cur_level_scores[j];
+      scores_of_all_rois[cur_index].index = j;
+      scores_of_all_rois[cur_index].level = i;
+      scores_of_all_rois[cur_index].batch_id = cur_batch_id;
+    }
+  }
+
+  // keep top post_nms_topN rois, sort the rois by the score
+  if (post_nms_topN > integral_of_all_rois[num_fpn_level]) {
+    post_nms_topN = integral_of_all_rois[num_fpn_level];
+  }
+  std::stable_sort(
+      scores_of_all_rois.begin(), scores_of_all_rois.end(), CompareByScore);
+  scores_of_all_rois.resize(post_nms_topN);
+  // sort by batch id
+  std::stable_sort(
+      scores_of_all_rois.begin(), scores_of_all_rois.end(), CompareByBatchid);
+  // create a pointer array
+  std::vector<const float*> multi_fpn_rois_data(num_fpn_level);
+  for (int i = 0; i < num_fpn_level; ++i) {
+    multi_fpn_rois_data[i] = multi_layer_rois[i]->data<float>();
+  }
+
+  // initialize the outputs
+  const int kBoxDim = 4;
+  auto fpn_rois_data = fpn_rois->mutable_data<float>();
+  std::vector<uint64_t> lod0(1, 0);
+  int cur_batch_id = 0;
+  for (int i = 0; i < post_nms_topN; ++i) {
+    int cur_fpn_level = scores_of_all_rois[i].level;
+    int cur_level_index = scores_of_all_rois[i].index;
+    std::memcpy(fpn_rois_data,
+                multi_fpn_rois_data[cur_fpn_level] + cur_level_index * kBoxDim,
+                kBoxDim * sizeof(float));
+    fpn_rois_data += kBoxDim;
+    if (scores_of_all_rois[i].batch_id != cur_batch_id) {
+      cur_batch_id = scores_of_all_rois[i].batch_id;
+      lod0.emplace_back(i);
+    }
+  }
+  lod0.emplace_back(post_nms_topN);
+  lite::LoD lod;
+  lod.emplace_back(lod0);
+  fpn_rois->set_lod(lod);
+  return;
+}
+
+}  // namespace arm
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(collect_fpn_proposals,
+                     kARM,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::arm::CollectFpnProposalsCompute,
+                     def)
+    .BindInput("MultiLevelRois", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindInput("MultiLevelScores", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("FpnRois", {LiteType::GetTensorTy(TARGET(kARM))})
+    .Finalize();
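
Note: CompareByScore returns a.score >= b.score, which is not a strict weak ordering (two equal scores each compare as "less" than the other), and std::stable_sort requires one; with >=, tied scores also lose the stable ordering the call presumably wants. A suggested replacement, not part of this patch:

// Strict > keeps the descending sort, satisfies the strict-weak-ordering
// requirement, and lets stable_sort preserve the original order of ties.
static inline bool CompareByScore(const ScoreWithID& a, const ScoreWithID& b) {
  return a.score > b.score;
}
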
diff --git a/lite/kernels/xpu/bridges/registry.cc b/lite/kernels/arm/collect_fpn_proposals_compute.h
similarity index 62%
rename from lite/kernels/xpu/bridges/registry.cc
rename to lite/kernels/arm/collect_fpn_proposals_compute.h
index 4ab1b69a25a29aeb1c1ceaff25525459ef2e94cd..f1e7448a07aee4f9c2b57a1c6d2223f4262c59b4 100644
--- a/lite/kernels/xpu/bridges/registry.cc
+++ b/lite/kernels/arm/collect_fpn_proposals_compute.h
@@ -12,30 +12,27 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "lite/kernels/xpu/bridges/registry.h"
-#include <utility>
+#pragma once
+#include <algorithm>
+#include "lite/core/kernel.h"
+#include "lite/operators/axpy_op.h"
 
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace xpu {
-namespace bridges {
+namespace arm {
 
-Factory& Factory::Instance() {
-  static Factory g_xpu_bridge;
-  return g_xpu_bridge;
-}
+class CollectFpnProposalsCompute
+    : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::CollectFpnProposalsParam;
 
-bool Factory::HasType(const std::string& op_type) const {
-  return map_.count(op_type);
-}
+  void Run() override;
 
-void Factory::Insert(const std::string& op_type, const func_type& func_name) {
-  map_.insert(std::make_pair(op_type, func_name));
-}
+  virtual ~CollectFpnProposalsCompute() = default;
+};
 
-}  // namespace bridges
-}  // namespace xpu
+}  // namespace arm
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/kernels/arm/compare_compute.cc b/lite/kernels/arm/compare_compute.cc
index 95014b4ccd427e152dfe919643afa5ff5eb3011d..6118cbc6e403645cada84d2434497b084636a4a3 100644
--- a/lite/kernels/arm/compare_compute.cc
+++ b/lite/kernels/arm/compare_compute.cc
@@ -112,6 +112,42 @@ void CompareCompute::Run() {
   }
 }
 
+template