Commit a867dbbf authored by chonwhite

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle-Lite into fpga_pr

...@@ -32,10 +32,9 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS ...@@ -32,10 +32,9 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
$ENV{CUDNN_ROOT}/lib64 $ENV{CUDNN_ROOT}/lib64
$ENV{CUDNN_ROOT}/lib $ENV{CUDNN_ROOT}/lib
/usr/lib /usr/lib
${CUDA_TOOLKIT_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
-${CUDA_TOOLKIT_ROOT_DIR}/lib64
-)
+${CUDA_TOOLKIT_ROOT_DIR}/lib64)
if((${CUDA_VERSION} GREATER 10.0) OR (${CUDA_VERSION} EQUAL 10.0)) if((${CUDA_VERSION} GREATER 10.0) OR (${CUDA_VERSION} EQUAL 10.0))
find_library(CUBLAS_LIBRARY NAMES libcublas.so PATHS ${CUDNN_CHECK_LIBRARY_DIRS} NO_DEFAULT_PATH) find_library(CUBLAS_LIBRARY NAMES libcublas.so PATHS ${CUDNN_CHECK_LIBRARY_DIRS} NO_DEFAULT_PATH)
......
...@@ -46,7 +46,6 @@ void OutputOptModel(const std::string& load_model_dir, ...@@ -46,7 +46,6 @@ void OutputOptModel(const std::string& load_model_dir,
config.set_model_dir(load_model_dir); config.set_model_dir(load_model_dir);
std::vector<Place> vaild_places = { std::vector<Place> vaild_places = {
Place{TARGET(kARM), PRECISION(kFloat)}, Place{TARGET(kARM), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)},
}; };
if (FLAGS_is_quantized_model) { if (FLAGS_is_quantized_model) {
vaild_places.insert(vaild_places.begin(), vaild_places.insert(vaild_places.begin(),
......
...@@ -47,7 +47,6 @@ void OutputOptModel(const std::string& load_model_dir, ...@@ -47,7 +47,6 @@ void OutputOptModel(const std::string& load_model_dir,
lite_api::CxxConfig config; lite_api::CxxConfig config;
config.set_model_dir(load_model_dir); config.set_model_dir(load_model_dir);
config.set_valid_places({ config.set_valid_places({
Place{TARGET(kX86), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}, Place{TARGET(kARM), PRECISION(kFloat)},
}); });
auto predictor = lite_api::CreatePaddlePredictor(config); auto predictor = lite_api::CreatePaddlePredictor(config);
......
...@@ -153,7 +153,7 @@ class LITE_API CxxConfig : public ConfigBase { ...@@ -153,7 +153,7 @@ class LITE_API CxxConfig : public ConfigBase {
std::string param_file() const { return param_file_; } std::string param_file() const { return param_file_; }
bool model_from_memory() const { return model_from_memory_; } bool model_from_memory() const { return model_from_memory_; }
-void set_cpu_math_library_math_threads(int threads) {
+void set_cpu_math_library_num_threads(int threads) {
cpu_math_library_math_threads_ = threads; cpu_math_library_math_threads_ = threads;
} }
int cpu_math_library_num_threads() const { int cpu_math_library_num_threads() const {
......
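For reference, a minimal usage sketch of the renamed setter and the ARM-only valid places from the hunks above. The include path and the helper function name are illustrative assumptions; only `CxxConfig`, `set_cpu_math_library_num_threads`, `set_valid_places`, `Place`, and `CreatePaddlePredictor` come from this diff.

```cpp
// Sketch only: shows the API surface touched by this commit.
#include <string>
#include "lite/api/paddle_api.h"  // assumed header location for CxxConfig/Place

void build_predictor(const std::string& model_dir) {  // hypothetical helper
  paddle::lite_api::CxxConfig config;
  config.set_model_dir(model_dir);
  // Renamed in this commit: set_cpu_math_library_math_threads ->
  // set_cpu_math_library_num_threads.
  config.set_cpu_math_library_num_threads(1);
  // The model/benchmark tools now list ARM only (the kX86 place is removed).
  config.set_valid_places({
      paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
  });
  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
  (void)predictor;
}
```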
...@@ -31,9 +31,11 @@ USE_MIR_PASS(lite_fc_fuse_pass); ...@@ -31,9 +31,11 @@ USE_MIR_PASS(lite_fc_fuse_pass);
USE_MIR_PASS(lite_shuffle_channel_fuse_pass); USE_MIR_PASS(lite_shuffle_channel_fuse_pass);
USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass); USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass);
USE_MIR_PASS(lite_interpolate_fuse_pass); USE_MIR_PASS(lite_interpolate_fuse_pass);
USE_MIR_PASS(lite_sequence_pool_concat_fuse_pass);
USE_MIR_PASS(identity_scale_eliminate_pass); USE_MIR_PASS(identity_scale_eliminate_pass);
USE_MIR_PASS(lite_conv_elementwise_fuse_pass); USE_MIR_PASS(lite_conv_elementwise_fuse_pass);
USE_MIR_PASS(lite_conv_activation_fuse_pass); USE_MIR_PASS(lite_conv_activation_fuse_pass);
USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass);
USE_MIR_PASS(lite_elementwise_add_activation_fuse_pass); USE_MIR_PASS(lite_elementwise_add_activation_fuse_pass);
USE_MIR_PASS(lite_quant_dequant_fuse_pass); USE_MIR_PASS(lite_quant_dequant_fuse_pass);
USE_MIR_PASS(type_precision_cast_pass); USE_MIR_PASS(type_precision_cast_pass);
......
...@@ -30,7 +30,7 @@ TEST(Step_rnn, test_step_rnn_lite_x86) { ...@@ -30,7 +30,7 @@ TEST(Step_rnn, test_step_rnn_lite_x86) {
std::string model_dir = FLAGS_model_dir; std::string model_dir = FLAGS_model_dir;
lite_api::CxxConfig config; lite_api::CxxConfig config;
config.set_model_dir(model_dir); config.set_model_dir(model_dir);
-config.set_cpu_math_library_math_threads(10);
+config.set_cpu_math_library_num_threads(1);
config.set_valid_places({lite_api::Place{TARGET(kX86), PRECISION(kInt64)}, config.set_valid_places({lite_api::Place{TARGET(kX86), PRECISION(kInt64)},
lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); lite_api::Place{TARGET(kHost), PRECISION(kFloat)}});
......
...@@ -25,7 +25,6 @@ void conv_depthwise_3x3s1p0_bias(float *dout, ...@@ -25,7 +25,6 @@ void conv_depthwise_3x3s1p0_bias(float *dout,
const float *weights, const float *weights,
const float *bias, const float *bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
...@@ -40,7 +39,6 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout, ...@@ -40,7 +39,6 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout,
const float *weights, const float *weights,
const float *bias, const float *bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
...@@ -55,7 +53,6 @@ void conv_depthwise_3x3s1p1_bias(float *dout, ...@@ -55,7 +53,6 @@ void conv_depthwise_3x3s1p1_bias(float *dout,
const float *weights, const float *weights,
const float *bias, const float *bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
...@@ -70,7 +67,6 @@ void conv_depthwise_3x3s1p1_bias_s(float *dout, ...@@ -70,7 +67,6 @@ void conv_depthwise_3x3s1p1_bias_s(float *dout,
const float *weights, const float *weights,
const float *bias, const float *bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
...@@ -93,7 +89,6 @@ void conv_depthwise_3x3s1_fp32(const float *din, ...@@ -93,7 +89,6 @@ void conv_depthwise_3x3s1_fp32(const float *din,
const float *bias, const float *bias,
int pad, int pad,
bool flag_bias, bool flag_bias,
bool flag_relu,
const operators::ActivationParam act_param, const operators::ActivationParam act_param,
ARMContext *ctx) { ARMContext *ctx) {
if (pad == 0) { if (pad == 0) {
...@@ -103,7 +98,6 @@ void conv_depthwise_3x3s1_fp32(const float *din, ...@@ -103,7 +98,6 @@ void conv_depthwise_3x3s1_fp32(const float *din,
weights, weights,
bias, bias,
flag_bias, flag_bias,
flag_relu,
num, num,
ch_in, ch_in,
h_in, h_in,
...@@ -118,7 +112,6 @@ void conv_depthwise_3x3s1_fp32(const float *din, ...@@ -118,7 +112,6 @@ void conv_depthwise_3x3s1_fp32(const float *din,
weights, weights,
bias, bias,
flag_bias, flag_bias,
flag_relu,
num, num,
ch_in, ch_in,
h_in, h_in,
...@@ -136,7 +129,6 @@ void conv_depthwise_3x3s1_fp32(const float *din, ...@@ -136,7 +129,6 @@ void conv_depthwise_3x3s1_fp32(const float *din,
weights, weights,
bias, bias,
flag_bias, flag_bias,
flag_relu,
num, num,
ch_in, ch_in,
h_in, h_in,
...@@ -151,7 +143,6 @@ void conv_depthwise_3x3s1_fp32(const float *din, ...@@ -151,7 +143,6 @@ void conv_depthwise_3x3s1_fp32(const float *din,
weights, weights,
bias, bias,
flag_bias, flag_bias,
flag_relu,
num, num,
ch_in, ch_in,
h_in, h_in,
...@@ -163,7 +154,7 @@ void conv_depthwise_3x3s1_fp32(const float *din, ...@@ -163,7 +154,7 @@ void conv_depthwise_3x3s1_fp32(const float *din,
} }
} }
} }
// clang-format on
#ifdef __aarch64__ #ifdef __aarch64__
#define INIT_S1 \ #define INIT_S1 \
"PRFM PLDL1KEEP, [%[din_ptr0]] \n" \ "PRFM PLDL1KEEP, [%[din_ptr0]] \n" \
...@@ -2318,7 +2309,6 @@ void act_switch_3x3s1p1(const float *din_ptr0, ...@@ -2318,7 +2309,6 @@ void act_switch_3x3s1p1(const float *din_ptr0,
} }
} }
#endif #endif
// clang-format on
/** /**
* \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias,
* width > 4 * width > 4
...@@ -2328,7 +2318,6 @@ void conv_depthwise_3x3s1p1_bias(float *dout, ...@@ -2328,7 +2318,6 @@ void conv_depthwise_3x3s1p1_bias(float *dout,
const float *weights, const float *weights,
const float *bias, const float *bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
...@@ -2857,7 +2846,6 @@ void conv_depthwise_3x3s1p1_bias_s(float *dout, ...@@ -2857,7 +2846,6 @@ void conv_depthwise_3x3s1p1_bias_s(float *dout,
const float *weights, const float *weights,
const float *bias, const float *bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
...@@ -3443,7 +3431,6 @@ void conv_depthwise_3x3s1p0_bias(float *dout, ...@@ -3443,7 +3431,6 @@ void conv_depthwise_3x3s1p0_bias(float *dout,
const float *weights, const float *weights,
const float *bias, const float *bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
...@@ -3579,129 +3566,6 @@ void conv_depthwise_3x3s1p0_bias(float *dout, ...@@ -3579,129 +3566,6 @@ void conv_depthwise_3x3s1p0_bias(float *dout,
} }
int cnt = tile_w; int cnt = tile_w;
/*
if (flag_relu) {
asm volatile(
INIT_S1
"ld1 {v8.4s}, [%[din_ptr4]], #16 \n" // vld1q_f32(din_ptr0)
"ld1 {v10.4s}, [%[din_ptr5]], #16 \n" // vld1q_f32(din_ptr0)
"ext v16.16b, v0.16b, v1.16b, #4 \n" // v16 = 1234
"ext v17.16b, v0.16b, v1.16b, #8 \n" // v17 = 2345
"ld1 {v9.4s}, [%[din_ptr4]] \n" // vld1q_f32(din_ptr0)
"ld1 {v11.4s}, [%[din_ptr5]] \n" // vld1q_f32(din_ptr0)
MID_COMPUTE_S1 MID_RESULT_S1_RELU
"cmp %w[remain], #1 \n"
"blt 0f \n" RIGHT_COMPUTE_S1
RIGHT_RESULT_S1_RELU "0: \n"
: [cnt] "+r"(cnt),
[din_ptr0] "+r"(din_ptr0),
[din_ptr1] "+r"(din_ptr1),
[din_ptr2] "+r"(din_ptr2),
[din_ptr3] "+r"(din_ptr3),
[din_ptr4] "+r"(din_ptr4),
[din_ptr5] "+r"(din_ptr5),
[doutr0] "+r"(doutr0),
[doutr1] "+r"(doutr1),
[doutr2] "+r"(doutr2),
[doutr3] "+r"(doutr3)
: [w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[bias_val] "r"(vbias),
[vmask] "r"(vmask),
[rmask] "r"(rmask),
[vzero] "w"(vzero),
[remain] "r"(remain)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25");
} else {
asm volatile(
INIT_S1
"ld1 {v8.4s}, [%[din_ptr4]], #16 \n" // vld1q_f32(din_ptr0)
"ld1 {v10.4s}, [%[din_ptr5]], #16 \n" // vld1q_f32(din_ptr0)
"ext v16.16b, v0.16b, v1.16b, #4 \n" // v16 = 1234
"ext v17.16b, v0.16b, v1.16b, #8 \n" // v17 = 2345
"ld1 {v9.4s}, [%[din_ptr4]] \n" // vld1q_f32(din_ptr0)
"ld1 {v11.4s}, [%[din_ptr5]] \n" // vld1q_f32(din_ptr0)
MID_COMPUTE_S1 MID_RESULT_S1
"cmp %w[remain], #1 \n"
"blt 0f \n" RIGHT_COMPUTE_S1
RIGHT_RESULT_S1 "0: \n"
: [cnt] "+r"(cnt),
[din_ptr0] "+r"(din_ptr0),
[din_ptr1] "+r"(din_ptr1),
[din_ptr2] "+r"(din_ptr2),
[din_ptr3] "+r"(din_ptr3),
[din_ptr4] "+r"(din_ptr4),
[din_ptr5] "+r"(din_ptr5),
[doutr0] "+r"(doutr0),
[doutr1] "+r"(doutr1),
[doutr2] "+r"(doutr2),
[doutr3] "+r"(doutr3)
: [w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[bias_val] "r"(vbias),
[vmask] "r"(vmask),
[rmask] "r"(rmask),
[vzero] "w"(vzero),
[remain] "r"(remain)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22",
"v23",
"v24",
"v25");
}
*/
act_switch_3x3s1p0(din_ptr0, act_switch_3x3s1p0(din_ptr0,
din_ptr1, din_ptr1,
din_ptr2, din_ptr2,
...@@ -3760,90 +3624,6 @@ void conv_depthwise_3x3s1p0_bias(float *dout, ...@@ -3760,90 +3624,6 @@ void conv_depthwise_3x3s1p0_bias(float *dout,
int cnt = tile_w; int cnt = tile_w;
unsigned int *rmask_ptr = rmask; unsigned int *rmask_ptr = rmask;
unsigned int *vmask_ptr = vmask; unsigned int *vmask_ptr = vmask;
/*
if (flag_relu) {
asm volatile(INIT_S1
"sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n"
"sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n"
"sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n"
"sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n"
"vext.32 q6, q8, q9, #1 @ 0012\n"
"vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1
MID_RESULT_S1_RELU
"cmp %[remain], #1 \n"
"blt 0f \n" RIGHT_COMPUTE_S1
RIGHT_RESULT_S1_RELU "0: \n"
: [dout_ptr1] "+r"(doutr0),
[dout_ptr2] "+r"(doutr1),
[din0_ptr] "+r"(din_ptr0),
[din1_ptr] "+r"(din_ptr1),
[din2_ptr] "+r"(din_ptr2),
[din3_ptr] "+r"(din_ptr3),
[cnt] "+r"(cnt),
[rmask] "+r"(rmask_ptr),
[vmask] "+r"(vmask_ptr)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[bias_val] "r"(bias_val),
[vzero] "w"(vzero),
[remain] "r"(remain)
: "cc",
"memory",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
} else {
asm volatile(INIT_S1
"sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n"
"sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n"
"sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n"
"sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n"
"vext.32 q6, q8, q9, #1 @ 0012\n"
"vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1
MID_RESULT_S1
"cmp %[remain], #1 \n"
"blt 0f \n" RIGHT_COMPUTE_S1
RIGHT_RESULT_S1 "0: \n"
: [dout_ptr1] "+r"(doutr0),
[dout_ptr2] "+r"(doutr1),
[din0_ptr] "+r"(din_ptr0),
[din1_ptr] "+r"(din_ptr1),
[din2_ptr] "+r"(din_ptr2),
[din3_ptr] "+r"(din_ptr3),
[cnt] "+r"(cnt),
[rmask] "+r"(rmask_ptr),
[vmask] "+r"(vmask_ptr)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[bias_val] "r"(bias_val),
[vzero] "w"(vzero),
[remain] "r"(remain)
: "cc",
"memory",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
}*/
act_switch_3x3s1p0(din_ptr0, act_switch_3x3s1p0(din_ptr0,
din_ptr1, din_ptr1,
din_ptr2, din_ptr2,
...@@ -4174,7 +3954,6 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout, ...@@ -4174,7 +3954,6 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout,
const float *weights, const float *weights,
const float *bias, const float *bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
...@@ -4213,14 +3992,6 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout, ...@@ -4213,14 +3992,6 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout,
float32x4_t wr1 = vld1q_f32(weight_ptr + 3); float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
float32x4_t wr2 = vld1q_f32(weight_ptr + 6); float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
// #ifdef __aarch64__
// float32x4_t wbias;
// if (flag_bias) {
// wbias = vdupq_n_f32(bias[i]);
// } else {
// wbias = vdupq_n_f32(0.f);
// }
// #endif // __aarch64__
float32x4_t wbias; float32x4_t wbias;
float bias_val = 0.f; float bias_val = 0.f;
if (flag_bias) { if (flag_bias) {
...@@ -4261,137 +4032,6 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout, ...@@ -4261,137 +4032,6 @@ void conv_depthwise_3x3s1p0_bias_s(float *dout,
break; break;
} }
} }
/*
#ifdef __aarch64__
if (flag_relu) {
asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU
: [din0] "+r"(dr0),
[din1] "+r"(dr1),
[din2] "+r"(dr2),
[din3] "+r"(dr3)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[vbias] "w"(wbias),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[vzero] "w"(vzero),
[out1] "r"(out_buf1),
[out2] "r"(out_buf2)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15");
} else {
asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1
: [din0] "+r"(dr0),
[din1] "+r"(dr1),
[din2] "+r"(dr2),
[din3] "+r"(dr3)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[vbias] "w"(wbias),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[vzero] "w"(vzero),
[out1] "r"(out_buf1),
[out2] "r"(out_buf2)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15");
}
#else
unsigned int *vmask_ptr = vmask;
float bias_val = flag_bias ? bias[i] : 0.f;
if (flag_relu) {
asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU
: [din0] "+r"(dr0),
[din1] "+r"(dr1),
[din2] "+r"(dr2),
[din3] "+r"(dr3),
[vmask] "+r"(vmask_ptr)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[vzero] "w"(vzero),
[bias_val] "r"(bias_val),
[out1] "r"(out_buf1),
[out2] "r"(out_buf2)
: "cc",
"memory",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
} else {
asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1
: [din0] "+r"(dr0),
[din1] "+r"(dr1),
[din2] "+r"(dr2),
[din3] "+r"(dr3),
[vmask] "+r"(vmask_ptr)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[vzero] "w"(vzero),
[bias_val] "r"(bias_val),
[out1] "r"(out_buf1),
[out2] "r"(out_buf2)
: "cc",
"memory",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
}
#endif
*/
unsigned int *vmask_ptr = vmask; unsigned int *vmask_ptr = vmask;
act_switch_3x3s1p0_s(dr0, act_switch_3x3s1p0_s(dr0,
dr1, dr1,
......
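The changes in this file drop the `flag_relu` argument and thread an `operators::ActivationParam` through the depthwise kernels instead, so relu, relu6, and leaky relu can all be fused. Below is a scalar sketch of the per-element activation the NEON macros implement, using the coefficient fields read in this diff (`Relu_clipped_coef`, `Leaky_relu_alpha`); it is a reference model, not the shipped asm.

```cpp
#include <algorithm>

// Scalar reference for the fused activation applied to each accumulator lane;
// the kernels do the same thing with NEON (fmax/fmin and cmhs+fmul+bif).
enum class ActType { kNone, kRelu, kRelu6, kLeakyRelu };

inline float apply_fused_act(float x, ActType type, float six, float alpha) {
  switch (type) {
    case ActType::kRelu:       // out = max(x, 0)
      return std::max(x, 0.f);
    case ActType::kRelu6:      // out = min(max(x, 0), six)
      return std::min(std::max(x, 0.f), six);
    case ActType::kLeakyRelu:  // out = x >= 0 ? x : x * alpha
      return x >= 0.f ? x : x * alpha;
    default:                   // no activation fused
      return x;
  }
}
```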
...@@ -836,7 +836,6 @@ void conv_3x3s1_depthwise_fp32(const float* i_data, ...@@ -836,7 +836,6 @@ void conv_3x3s1_depthwise_fp32(const float* i_data,
threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/; threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/;
ctx->ExtendWorkspace(sizeof(float) * workspace_size); ctx->ExtendWorkspace(sizeof(float) * workspace_size);
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr; bool flag_bias = param.bias != nullptr;
/// get workspace /// get workspace
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include <arm_neon.h> #include <arm_neon.h>
#include "lite/backends/arm/math/conv_block_utils.h"
#include "lite/backends/arm/math/conv_depthwise.h" #include "lite/backends/arm/math/conv_depthwise.h"
namespace paddle { namespace paddle {
...@@ -24,13 +25,13 @@ void conv_depthwise_3x3s2p0_bias(float* dout, ...@@ -24,13 +25,13 @@ void conv_depthwise_3x3s2p0_bias(float* dout,
const float* weights, const float* weights,
const float* bias, const float* bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
const int w_in, const int w_in,
const int h_out, const int h_out,
const int w_out, const int w_out,
const operators::ActivationParam act_param,
ARMContext* ctx); ARMContext* ctx);
void conv_depthwise_3x3s2p0_bias_s(float* dout, void conv_depthwise_3x3s2p0_bias_s(float* dout,
...@@ -38,13 +39,13 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout, ...@@ -38,13 +39,13 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout,
const float* weights, const float* weights,
const float* bias, const float* bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
const int w_in, const int w_in,
const int h_out, const int h_out,
const int w_out, const int w_out,
const operators::ActivationParam act_param,
ARMContext* ctx); ARMContext* ctx);
void conv_depthwise_3x3s2p1_bias(float* dout, void conv_depthwise_3x3s2p1_bias(float* dout,
...@@ -52,13 +53,13 @@ void conv_depthwise_3x3s2p1_bias(float* dout, ...@@ -52,13 +53,13 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
const float* weights, const float* weights,
const float* bias, const float* bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
const int w_in, const int w_in,
const int h_out, const int h_out,
const int w_out, const int w_out,
const operators::ActivationParam act_param,
ARMContext* ctx); ARMContext* ctx);
void conv_depthwise_3x3s2p1_bias_s(float* dout, void conv_depthwise_3x3s2p1_bias_s(float* dout,
...@@ -66,13 +67,13 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout, ...@@ -66,13 +67,13 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout,
const float* weights, const float* weights,
const float* bias, const float* bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
const int w_in, const int w_in,
const int h_out, const int h_out,
const int w_out, const int w_out,
const operators::ActivationParam act_param,
ARMContext* ctx); ARMContext* ctx);
void conv_depthwise_3x3s2_fp32(const float* din, void conv_depthwise_3x3s2_fp32(const float* din,
...@@ -88,7 +89,7 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -88,7 +89,7 @@ void conv_depthwise_3x3s2_fp32(const float* din,
const float* bias, const float* bias,
int pad, int pad,
bool flag_bias, bool flag_bias,
-bool flag_relu,
+const operators::ActivationParam act_param,
ARMContext* ctx) { ARMContext* ctx) {
if (pad == 0) { if (pad == 0) {
if (w_in > 7) { if (w_in > 7) {
...@@ -97,13 +98,13 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -97,13 +98,13 @@ void conv_depthwise_3x3s2_fp32(const float* din,
weights, weights,
bias, bias,
flag_bias, flag_bias,
flag_relu,
num, num,
ch_in, ch_in,
h_in, h_in,
w_in, w_in,
h_out, h_out,
w_out, w_out,
act_param,
ctx); ctx);
} else { } else {
conv_depthwise_3x3s2p0_bias_s(dout, conv_depthwise_3x3s2p0_bias_s(dout,
...@@ -111,13 +112,13 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -111,13 +112,13 @@ void conv_depthwise_3x3s2_fp32(const float* din,
weights, weights,
bias, bias,
flag_bias, flag_bias,
flag_relu,
num, num,
ch_in, ch_in,
h_in, h_in,
w_in, w_in,
h_out, h_out,
w_out, w_out,
act_param,
ctx); ctx);
} }
} }
...@@ -128,13 +129,13 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -128,13 +129,13 @@ void conv_depthwise_3x3s2_fp32(const float* din,
weights, weights,
bias, bias,
flag_bias, flag_bias,
flag_relu,
num, num,
ch_in, ch_in,
h_in, h_in,
w_in, w_in,
h_out, h_out,
w_out, w_out,
act_param,
ctx); ctx);
} else { } else {
conv_depthwise_3x3s2p1_bias_s(dout, conv_depthwise_3x3s2p1_bias_s(dout,
...@@ -142,13 +143,13 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -142,13 +143,13 @@ void conv_depthwise_3x3s2_fp32(const float* din,
weights, weights,
bias, bias,
flag_bias, flag_bias,
flag_relu,
num, num,
ch_in, ch_in,
h_in, h_in,
w_in, w_in,
h_out, h_out,
w_out, w_out,
act_param,
ctx); ctx);
} }
} }
...@@ -412,6 +413,83 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -412,6 +413,83 @@ void conv_depthwise_3x3s2_fp32(const float* din,
"and v17.16b, %[vbias].16b, %[vbias].16b \n" \ "and v17.16b, %[vbias].16b, %[vbias].16b \n" \
\ \
"blt 1f \n" "blt 1f \n"
#define LEFT_RESULT_S2_RELU6 \
"fmax v16.4s, v16.4s, %[vzero].4s \n" \
"ld1 {v22.4s}, [%[six_ptr]] \n" \
\
"ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \
"ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \
"ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \
\
"fadd v17.4s, v17.4s, v13.4s \n" \
"fmin v16.4s, v16.4s, v22.4s \n" \
\
"ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \
"ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \
"ld1 {v15.4s}, [%[inptr0]] \n" \
\
"fadd v17.4s, v17.4s, v14.4s \n" \
"st1 {v16.4s}, [%[outptr0]], #16 \n" \
\
"ld1 {v18.4s}, [%[inptr1]] \n" \
"ld1 {v19.4s}, [%[inptr2]] \n" \
\
"ext v10.16b, v0.16b, v15.16b, #4 \n" \
\
"and v16.16b, %[vbias].16b, %[vbias].16b \n" \
"fmax v17.4s, v17.4s, %[vzero].4s \n" \
\
"ld1 {v20.4s}, [%[inptr3]] \n" \
"ld1 {v21.4s}, [%[inptr4]] \n" \
\
"fmin v17.4s, v17.4s, v22.4s \n" \
\
"cmp %w[cnt], #1 \n" \
\
"st1 {v17.4s}, [%[outptr1]], #16 \n" \
"and v17.16b, %[vbias].16b, %[vbias].16b \n" \
\
"blt 1f \n"
#define LEFT_RESULT_S2_LEAKY_RELU \
"ld1 {v22.4s}, [%[scale_ptr]] \n" \
"cmhs v11.4s, v16.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
\
"ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \
"ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \
"ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \
\
"fmul v12.4s, v16.4s, v22.4s \n" \
"fadd v17.4s, v17.4s, v13.4s \n" \
\
"ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \
"ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \
"ld1 {v15.4s}, [%[inptr0]] \n" \
\
"fadd v17.4s, v17.4s, v14.4s \n" \
"bif v16.16b, v12.16b, v11.16b \n" /* choose*/ \
\
"ld1 {v18.4s}, [%[inptr1]] \n" \
"ld1 {v19.4s}, [%[inptr2]] \n" \
\
"ext v10.16b, v0.16b, v15.16b, #4 \n" \
\
"st1 {v16.4s}, [%[outptr0]], #16 \n" \
"cmhs v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v12.4s, v16.4s, v22.4s \n" \
\
"ld1 {v20.4s}, [%[inptr3]] \n" \
"ld1 {v21.4s}, [%[inptr4]] \n" \
\
"and v16.16b, %[vbias].16b, %[vbias].16b \n" \
"bif v17.16b, v12.16b, v11.16b \n" /* choose*/ \
\
"cmp %w[cnt], #1 \n" \
\
"st1 {v17.4s}, [%[outptr1]], #16 \n" \
"and v17.16b, %[vbias].16b, %[vbias].16b \n" \
\
"blt 1f \n"
#define MID_RESULT_S2_RELU \ #define MID_RESULT_S2_RELU \
"fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \
...@@ -438,6 +516,58 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -438,6 +516,58 @@ void conv_depthwise_3x3s2_fp32(const float* din,
\ \
"bne 2b \n" "bne 2b \n"
#define MID_RESULT_S2_RELU6 \
"fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \
\
"fadd v17.4s, v17.4s, v13.4s \n" \
\
"ld1 {v19.4s}, [%[inptr2]] \n" \
"ld1 {v20.4s}, [%[inptr3]] \n" \
"ld1 {v21.4s}, [%[inptr4]] \n" \
\
"fmin v16.4s, v16.4s, v22.4s \n" \
\
"fadd v17.4s, v17.4s, v14.4s \n" \
\
"ext v10.16b, v0.16b, v15.16b, #4 \n" \
"st1 {v16.4s}, [%[outptr0]], #16 \n" \
"subs %w[cnt], %w[cnt], #1 \n" \
\
"fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \
"and v16.16b, %[vbias].16b, %[vbias].16b \n" \
"fmin v17.4s, v17.4s, v22.4s \n" \
"st1 {v17.4s}, [%[outptr1]], #16 \n" \
\
"and v17.16b, %[vbias].16b, %[vbias].16b \n" \
\
"bne 2b \n"
#define MID_RESULT_S2_LEAKY_RELU \
"cmhs v11.4s, v16.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v12.4s, v16.4s, v22.4s \n" \
\
"fadd v17.4s, v17.4s, v13.4s \n" \
\
"ld1 {v19.4s}, [%[inptr2]] \n" \
"ld1 {v20.4s}, [%[inptr3]] \n" \
"ld1 {v21.4s}, [%[inptr4]] \n" \
\
"bif v16.16b, v12.16b, v11.16b \n" /* choose*/ \
"ext v10.16b, v0.16b, v15.16b, #4 \n" \
"cmhs v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v12.4s, v17.4s, v22.4s \n" \
\
"st1 {v16.4s}, [%[outptr0]], #16 \n" \
"subs %w[cnt], %w[cnt], #1 \n" \
\
"and v16.16b, %[vbias].16b, %[vbias].16b \n" \
"bif v17.16b, v12.16b, v11.16b \n" /* choose*/ \
"st1 {v17.4s}, [%[outptr1]], #16 \n" \
\
"and v17.16b, %[vbias].16b, %[vbias].16b \n" \
\
"bne 2b \n"
#define RIGHT_RESULT_S2_RELU \ #define RIGHT_RESULT_S2_RELU \
"fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \
\ \
...@@ -456,6 +586,47 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -456,6 +586,47 @@ void conv_depthwise_3x3s2_fp32(const float* din,
"st1 {v17.4s}, [%[outptr1]], #16 \n" \ "st1 {v17.4s}, [%[outptr1]], #16 \n" \
"4: \n" "4: \n"
#define RIGHT_RESULT_S2_RELU6 \
"fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \
\
"fadd v17.4s, v17.4s, v13.4s \n" \
\
"fmin v16.4s, v16.4s, v22.4s \n" \
\
"fadd v17.4s, v17.4s, v14.4s \n" \
\
"bif v16.16b, v0.16b, %[wmask].16b \n" \
\
"fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \
\
"st1 {v16.4s}, [%[outptr0]], #16 \n" \
"fmin v17.4s, v17.4s, v22.4s \n" \
"bif v17.16b, v1.16b, %[wmask].16b \n" \
\
"st1 {v17.4s}, [%[outptr1]], #16 \n" \
"4: \n"
#define RIGHT_RESULT_S2_LEAKY_RELU \
"cmhs v11.4s, v16.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v12.4s, v16.4s, v22.4s \n" \
"fadd v17.4s, v17.4s, v13.4s \n" \
\
"bif v16.16b, v12.16b, v11.16b \n" /* choose*/ \
\
"fadd v17.4s, v17.4s, v14.4s \n" \
\
"bif v16.16b, v0.16b, %[wmask].16b \n" \
\
"cmhs v11.4s, v17.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v12.4s, v17.4s, v22.4s \n" \
\
"st1 {v16.4s}, [%[outptr0]], #16 \n" \
"bif v17.16b, v12.16b, v11.16b \n" /* choose*/ \
"bif v17.16b, v1.16b, %[wmask].16b \n" \
\
"st1 {v17.4s}, [%[outptr1]], #16 \n" \
"4: \n"
#define COMPUTE_S_S2 \ #define COMPUTE_S_S2 \
"movi v9.4s, #0 \n" \ "movi v9.4s, #0 \n" \
"ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \
...@@ -500,7 +671,6 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -500,7 +671,6 @@ void conv_depthwise_3x3s2_fp32(const float* din,
"fmax v4.4s, v4.4s, v9.4s \n" \ "fmax v4.4s, v4.4s, v9.4s \n" \
\ \
"st1 {v4.4s}, [%[out]] \n" "st1 {v4.4s}, [%[out]] \n"
#define COMPUTE_S_S2_P0 \ #define COMPUTE_S_S2_P0 \
"movi v9.4s, #0 \n" \ "movi v9.4s, #0 \n" \
"ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \
...@@ -537,7 +707,6 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -537,7 +707,6 @@ void conv_depthwise_3x3s2_fp32(const float* din,
"fadd v4.4s, v4.4s, v16.4s \n" "fadd v4.4s, v4.4s, v16.4s \n"
#define RESULT_S_S2_P0 "st1 {v4.4s}, [%[out]] \n" #define RESULT_S_S2_P0 "st1 {v4.4s}, [%[out]] \n"
#define RESULT_S_S2_P0_RELU \ #define RESULT_S_S2_P0_RELU \
"fmax v4.4s, v4.4s, v9.4s \n" \ "fmax v4.4s, v4.4s, v9.4s \n" \
"st1 {v4.4s}, [%[out]] \n" "st1 {v4.4s}, [%[out]] \n"
...@@ -682,7 +851,6 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -682,7 +851,6 @@ void conv_depthwise_3x3s2_fp32(const float* din,
"vst1.32 {d6-d7}, [%[outptr]]! \n" \ "vst1.32 {d6-d7}, [%[outptr]]! \n" \
"cmp %[cnt], #1 \n" \ "cmp %[cnt], #1 \n" \
"blt 1f \n" "blt 1f \n"
#define MID_RESULT_S2_RELU \ #define MID_RESULT_S2_RELU \
"vmax.f32 q3, q3, q9 @ relu \n" \ "vmax.f32 q3, q3, q9 @ relu \n" \
"subs %[cnt], #1 \n" \ "subs %[cnt], #1 \n" \
...@@ -739,7 +907,6 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -739,7 +907,6 @@ void conv_depthwise_3x3s2_fp32(const float* din,
"vadd.f32 q3, q3, q5 @ add \n" "vadd.f32 q3, q3, q5 @ add \n"
#define RESULT_S_S2 "vst1.32 {d6-d7}, [%[out]] \n" #define RESULT_S_S2 "vst1.32 {d6-d7}, [%[out]] \n"
#define RESULT_S_S2_RELU \ #define RESULT_S_S2_RELU \
"vmax.f32 q3, q3, q9 @ relu\n" \ "vmax.f32 q3, q3, q9 @ relu\n" \
\ \
...@@ -787,13 +954,233 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -787,13 +954,233 @@ void conv_depthwise_3x3s2_fp32(const float* din,
"vadd.f32 q3, q3, q5 @ add \n" "vadd.f32 q3, q3, q5 @ add \n"
#define RESULT_S_S2_P0 "vst1.32 {d6-d7}, [%[out]] \n" #define RESULT_S_S2_P0 "vst1.32 {d6-d7}, [%[out]] \n"
#define RESULT_S_S2_P0_RELU \ #define RESULT_S_S2_P0_RELU \
"vmax.f32 q3, q3, q9 @ relu \n" \ "vmax.f32 q3, q3, q9 @ relu \n" \
"vst1.32 {d6-d7}, [%[out]] \n" "vst1.32 {d6-d7}, [%[out]] \n"
#endif #endif
#ifdef __aarch64__
void act_switch_3x3s2p1(const float* din0_ptr,
const float* din1_ptr,
const float* din2_ptr,
const float* din3_ptr,
const float* din4_ptr,
float* doutr0_ptr,
float* doutr1_ptr,
float32x4_t wr0,
float32x4_t wr1,
float32x4_t wr2,
uint32x4_t vmask_rp1,
uint32x4_t vmask_rp2,
uint32x4_t wmask,
float32x4_t wbias,
float32x4_t vzero,
int cnt,
int cnt_remain,
const operators::ActivationParam act_param) {
bool has_active = act_param.has_active;
if (has_active) {
float tmp = act_param.Relu_clipped_coef;
float ss = act_param.Leaky_relu_alpha;
float vsix[4] = {tmp, tmp, tmp, tmp};
float vscale[4] = {ss, ss, ss, ss};
switch (act_param.active_type) {
case lite_api::ActivationType::kRelu:
asm volatile(
INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2
MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU
: [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr),
[inptr3] "+r"(din3_ptr),
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21");
break;
case lite_api::ActivationType::kRelu6:
/* 0 <= din <= 6 */
asm volatile(
INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU6 MID_COMPUTE_S2
MID_RESULT_S2_RELU6 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU6
: [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr),
[inptr3] "+r"(din3_ptr),
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[six_ptr] "r"(vsix),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22");
break;
case lite_api::ActivationType::kLeakyRelu:
/*din = din >= 0 ? din : din * scale*/
asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_LEAKY_RELU
MID_COMPUTE_S2 MID_RESULT_S2_LEAKY_RELU
RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_LEAKY_RELU
: [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr),
[inptr3] "+r"(din3_ptr),
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[scale_ptr] "r"(vscale),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22");
break;
default:
LOG(FATAL) << "this act_type: "
<< static_cast<int>(act_param.active_type)
<< " fuse not support";
}
} else {
asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2
MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2
: [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr),
[inptr3] "+r"(din3_ptr),
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21");
}
}
#endif
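For orientation, the new `act_switch_3x3s2p1` / `act_switch_3x3s2p0` helpers pick one of four asm bodies based on `ActivationParam`. A hedged sketch of how a caller might fill the fields these helpers read (`has_active`, `active_type`, `Relu_clipped_coef`); the include path is an assumption, and the real construction happens in the conv kernel setup, which is not part of this diff.

```cpp
#include "lite/operators/op_params.h"  // assumed header for ActivationParam

// Illustrative only: builds a relu6 ActivationParam as consumed by the
// act_switch_* dispatchers above.
paddle::lite::operators::ActivationParam make_relu6_param(float six) {
  paddle::lite::operators::ActivationParam act_param;
  act_param.has_active = true;
  act_param.active_type = paddle::lite_api::ActivationType::kRelu6;
  act_param.Relu_clipped_coef = six;  // upper clamp used by the RELU6 macros
  return act_param;
}
```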
/** /**
* \brief depthwise convolution kernel 3x3, stride 2 * \brief depthwise convolution kernel 3x3, stride 2
* w_in > 7 * w_in > 7
...@@ -803,13 +1190,13 @@ void conv_depthwise_3x3s2p1_bias(float* dout, ...@@ -803,13 +1190,13 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
const float* weights, const float* weights,
const float* bias, const float* bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
const int w_in, const int w_in,
const int h_out, const int h_out,
const int w_out, const int w_out,
const operators::ActivationParam act_param,
ARMContext* ctx) { ARMContext* ctx) {
int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
int out_pad_idx[4] = {0, 1, 2, 3}; int out_pad_idx[4] = {0, 1, 2, 3};
...@@ -821,7 +1208,7 @@ void conv_depthwise_3x3s2p1_bias(float* dout, ...@@ -821,7 +1208,7 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
cnt_col++; cnt_col++;
size_right_remain -= 8; size_right_remain -= 8;
} }
int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4); // int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4);
int size_right_pad = w_out * 2 - w_in; int size_right_pad = w_out * 2 - w_in;
...@@ -935,96 +1322,24 @@ void conv_depthwise_3x3s2p1_bias(float* dout, ...@@ -935,96 +1322,24 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
doutr1_ptr = write_ptr; doutr1_ptr = write_ptr;
} }
int cnt = cnt_col; int cnt = cnt_col;
if (flag_relu) { act_switch_3x3s2p1(din0_ptr,
asm volatile( din1_ptr,
INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 din2_ptr,
MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU din3_ptr,
: [inptr0] "+r"(din0_ptr), din4_ptr,
[inptr1] "+r"(din1_ptr), doutr0_ptr,
[inptr2] "+r"(din2_ptr), doutr1_ptr,
[inptr3] "+r"(din3_ptr), wr0,
[inptr4] "+r"(din4_ptr), wr1,
[outptr0] "+r"(doutr0_ptr), wr2,
[outptr1] "+r"(doutr1_ptr), vmask_rp1,
[cnt] "+r"(cnt) vmask_rp2,
: [vzero] "w"(vzero), wmask,
[w0] "w"(wr0), wbias,
[w1] "w"(wr1), vzero,
[w2] "w"(wr2), cnt,
[remain] "r"(cnt_remain), cnt_remain,
[mask1] "w"(vmask_rp1), act_param);
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21");
} else {
asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2
MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2
: [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr),
[inptr3] "+r"(din3_ptr),
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21");
}
doutr0 = doutr0 + 2 * w_out; doutr0 = doutr0 + 2 * w_out;
} }
#else #else
...@@ -1061,65 +1376,37 @@ void conv_depthwise_3x3s2p1_bias(float* dout, ...@@ -1061,65 +1376,37 @@ void conv_depthwise_3x3s2p1_bias(float* dout,
} }
int cnt = cnt_col; int cnt = cnt_col;
unsigned int* mask_ptr = dmask; unsigned int* mask_ptr = dmask;
if (flag_relu) { asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2
asm volatile( MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2
INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 : [din0_ptr] "+r"(din0_ptr),
MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU [din1_ptr] "+r"(din1_ptr),
: [din0_ptr] "+r"(din0_ptr), [din2_ptr] "+r"(din2_ptr),
[din1_ptr] "+r"(din1_ptr), [outptr] "+r"(doutr0_ptr),
[din2_ptr] "+r"(din2_ptr), [cnt] "+r"(cnt),
[outptr] "+r"(doutr0_ptr), [mask_ptr] "+r"(mask_ptr)
[cnt] "+r"(cnt), : [remain] "r"(cnt_remain),
[mask_ptr] "+r"(mask_ptr) [wr0] "w"(wr0),
: [remain] "r"(cnt_remain), [wr1] "w"(wr1),
[wr0] "w"(wr0), [wr2] "w"(wr2),
[wr1] "w"(wr1), [bias] "r"(bias_c)
[wr2] "w"(wr2), : "cc",
[bias] "r"(bias_c) "memory",
: "cc", "q3",
"memory", "q4",
"q3", "q5",
"q4", "q6",
"q5", "q7",
"q6", "q8",
"q7", "q9",
"q8", "q10",
"q9", "q11",
"q10", "q12",
"q11", "q13",
"q12", "q14",
"q13", "q15");
"q14", // do act
"q15"); if (act_param.has_active) {
} else { act_switch_process(doutr0, doutr0, w_out, &act_param);
asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2
MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2
: [din0_ptr] "+r"(din0_ptr),
[din1_ptr] "+r"(din1_ptr),
[din2_ptr] "+r"(din2_ptr),
[outptr] "+r"(doutr0_ptr),
[cnt] "+r"(cnt),
[mask_ptr] "+r"(mask_ptr)
: [remain] "r"(cnt_remain),
[wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[bias] "r"(bias_c)
: "cc",
"memory",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
} }
doutr0 = doutr0 + w_out; doutr0 = doutr0 + w_out;
} }
...@@ -1136,13 +1423,13 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout, ...@@ -1136,13 +1423,13 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout,
const float* weights, const float* weights,
const float* bias, const float* bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
const int w_in, const int w_in,
const int h_out, const int h_out,
const int w_out, const int w_out,
const operators::ActivationParam act_param,
ARMContext* ctx) { ARMContext* ctx) {
int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
int out_pad_idx[4] = {0, 1, 2, 3}; int out_pad_idx[4] = {0, 1, 2, 3};
...@@ -1198,108 +1485,59 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout, ...@@ -1198,108 +1485,59 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout,
unsigned int* mask_ptr = dmask; unsigned int* mask_ptr = dmask;
#ifdef __aarch64__ #ifdef __aarch64__
if (flag_relu) { asm volatile(COMPUTE_S_S2 RESULT_S_S2
asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU : [din0_ptr] "+r"(din0_ptr),
: [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr),
[din1_ptr] "+r"(din1_ptr), [din2_ptr] "+r"(din2_ptr),
[din2_ptr] "+r"(din2_ptr), [mask_ptr] "+r"(mask_ptr)
[mask_ptr] "+r"(mask_ptr) : [wr0] "w"(wr0),
: [wr0] "w"(wr0), [wr1] "w"(wr1),
[wr1] "w"(wr1), [wr2] "w"(wr2),
[wr2] "w"(wr2), [bias] "w"(vbias),
[bias] "w"(vbias), [out] "r"(out_buf)
[out] "r"(out_buf) : "v4",
: "v4", "v5",
"v5", "v6",
"v6", "v7",
"v7", "v8",
"v8", "v9",
"v9", "v10",
"v10", "v11",
"v11", "v12",
"v12", "v13",
"v13", "v14",
"v14", "v15");
"v15");
} else {
asm volatile(COMPUTE_S_S2 RESULT_S_S2
: [din0_ptr] "+r"(din0_ptr),
[din1_ptr] "+r"(din1_ptr),
[din2_ptr] "+r"(din2_ptr),
[mask_ptr] "+r"(mask_ptr)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[bias] "w"(vbias),
[out] "r"(out_buf)
: "v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15");
}
#else #else
if (flag_relu) { asm volatile(COMPUTE_S_S2 RESULT_S_S2
asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU : [din0_ptr] "+r"(din0_ptr),
: [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr),
[din1_ptr] "+r"(din1_ptr), [din2_ptr] "+r"(din2_ptr),
[din2_ptr] "+r"(din2_ptr), [mask_ptr] "+r"(mask_ptr)
[mask_ptr] "+r"(mask_ptr) : [wr0] "w"(wr0),
: [wr0] "w"(wr0), [wr1] "w"(wr1),
[wr1] "w"(wr1), [wr2] "w"(wr2),
[wr2] "w"(wr2), [bias] "r"(bias_c),
[bias] "r"(bias_c), [out] "r"(out_buf)
[out] "r"(out_buf) : "cc",
: "cc", "memory",
"memory", "q3",
"q3", "q4",
"q4", "q5",
"q5", "q6",
"q6", "q7",
"q7", "q8",
"q8", "q9",
"q9", "q10",
"q10", "q11",
"q11", "q12",
"q12", "q13",
"q13", "q14",
"q14", "q15");
"q15");
} else {
asm volatile(COMPUTE_S_S2 RESULT_S_S2
: [din0_ptr] "+r"(din0_ptr),
[din1_ptr] "+r"(din1_ptr),
[din2_ptr] "+r"(din2_ptr),
[mask_ptr] "+r"(mask_ptr)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[bias] "r"(bias_c),
[out] "r"(out_buf)
: "cc",
"memory",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
}
#endif #endif
// do act
if (act_param.has_active) {
act_switch_process(out_buf, out_buf, w_out, &act_param);
}
for (int w = 0; w < w_out; ++w) { for (int w = 0; w < w_out; ++w) {
*dout_channel++ = out_buf[w]; *dout_channel++ = out_buf[w];
} }
...@@ -1310,6 +1548,269 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout, ...@@ -1310,6 +1548,269 @@ void conv_depthwise_3x3s2p1_bias_s(float* dout,
} }
} }
#ifdef __aarch64__
void act_switch_3x3s2p0(const float* din0_ptr,
const float* din1_ptr,
const float* din2_ptr,
const float* din3_ptr,
const float* din4_ptr,
float* doutr0_ptr,
float* doutr1_ptr,
float32x4_t wr0,
float32x4_t wr1,
float32x4_t wr2,
uint32x4_t vmask_rp1,
uint32x4_t vmask_rp2,
uint32x4_t wmask,
float32x4_t wbias,
float32x4_t vzero,
int cnt,
int cnt_remain,
const operators::ActivationParam act_param) {
bool has_active = act_param.has_active;
if (has_active) {
float tmp = act_param.Relu_clipped_coef;
float ss = act_param.Leaky_relu_alpha;
float vsix[4] = {tmp, tmp, tmp, tmp};
float vscale[4] = {ss, ss, ss, ss};
switch (act_param.active_type) {
case lite_api::ActivationType::kRelu:
asm volatile(
INIT_S2
"ld1 {v15.4s}, [%[inptr0]] \n"
"ld1 {v18.4s}, [%[inptr1]] \n"
"ld1 {v19.4s}, [%[inptr2]] \n"
"ld1 {v20.4s}, [%[inptr3]] \n"
"ld1 {v21.4s}, [%[inptr4]] \n"
"ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8}
MID_COMPUTE_S2 MID_RESULT_S2_RELU
"cmp %w[remain], #1 \n"
"blt 4f \n" RIGHT_COMPUTE_S2
RIGHT_RESULT_S2_RELU
"4: \n"
: [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr),
[inptr3] "+r"(din3_ptr),
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21");
break;
case lite_api::ActivationType::kRelu6:
/* 0 <= din <= 6 */
asm volatile(
INIT_S2
"ld1 {v15.4s}, [%[inptr0]] \n"
"ld1 {v18.4s}, [%[inptr1]] \n"
"ld1 {v19.4s}, [%[inptr2]] \n"
"ld1 {v20.4s}, [%[inptr3]] \n"
"ld1 {v21.4s}, [%[inptr4]] \n"
"ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8}
MID_COMPUTE_S2 MID_RESULT_S2_RELU6
"cmp %w[remain], #1 \n"
"blt 4f \n" RIGHT_COMPUTE_S2
RIGHT_RESULT_S2_RELU6
"4: \n"
: [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr),
[inptr3] "+r"(din3_ptr),
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[six_ptr] "r"(vsix),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22");
break;
case lite_api::ActivationType::kLeakyRelu:
/*din = din >= 0 ? din : din * scale*/
asm volatile(
INIT_S2
"ld1 {v15.4s}, [%[inptr0]] \n"
"ld1 {v18.4s}, [%[inptr1]] \n"
"ld1 {v19.4s}, [%[inptr2]] \n"
"ld1 {v20.4s}, [%[inptr3]] \n"
"ld1 {v21.4s}, [%[inptr4]] \n"
"ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8}
MID_COMPUTE_S2 MID_RESULT_S2_LEAKY_RELU
"cmp %w[remain], #1 \n"
"blt 4f \n" RIGHT_COMPUTE_S2
RIGHT_RESULT_S2_LEAKY_RELU
"4: \n"
: [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr),
[inptr3] "+r"(din3_ptr),
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[six_ptr] "r"(vscale),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21",
"v22");
break;
default:
LOG(FATAL) << "this act_type: "
<< static_cast<int>(act_param.active_type)
<< " fuse not support";
}
} else {
asm volatile(
INIT_S2
"ld1 {v15.4s}, [%[inptr0]] \n"
"ld1 {v18.4s}, [%[inptr1]] \n"
"ld1 {v19.4s}, [%[inptr2]] \n"
"ld1 {v20.4s}, [%[inptr3]] \n"
"ld1 {v21.4s}, [%[inptr4]] \n"
"ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8}
MID_COMPUTE_S2 MID_RESULT_S2
"cmp %w[remain], #1 \n"
"blt 4f \n" RIGHT_COMPUTE_S2
RIGHT_RESULT_S2 "4: \n"
: [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr),
[inptr3] "+r"(din3_ptr),
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21");
}
}
#endif
/** /**
* \brief depthwise convolution kernel 3x3, stride 2 * \brief depthwise convolution kernel 3x3, stride 2
*/ */
...@@ -1319,13 +1820,13 @@ void conv_depthwise_3x3s2p0_bias(float* dout, ...@@ -1319,13 +1820,13 @@ void conv_depthwise_3x3s2p0_bias(float* dout,
const float* weights, const float* weights,
const float* bias, const float* bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
const int w_in, const int w_in,
const int h_out, const int h_out,
const int w_out, const int w_out,
const operators::ActivationParam act_param,
ARMContext* ctx) { ARMContext* ctx) {
int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
int out_pad_idx[4] = {0, 1, 2, 3}; int out_pad_idx[4] = {0, 1, 2, 3};
...@@ -1438,117 +1939,24 @@ void conv_depthwise_3x3s2p0_bias(float* dout, ...@@ -1438,117 +1939,24 @@ void conv_depthwise_3x3s2p0_bias(float* dout,
doutr1_ptr = write_ptr; doutr1_ptr = write_ptr;
} }
int cnt = tile_w; int cnt = tile_w;
if (flag_relu) { act_switch_3x3s2p0(din0_ptr,
asm volatile( din1_ptr,
INIT_S2 din2_ptr,
"ld1 {v15.4s}, [%[inptr0]] \n" din3_ptr,
"ld1 {v18.4s}, [%[inptr1]] \n" din4_ptr,
"ld1 {v19.4s}, [%[inptr2]] \n" doutr0_ptr,
"ld1 {v20.4s}, [%[inptr3]] \n" doutr1_ptr,
"ld1 {v21.4s}, [%[inptr4]] \n" wr0,
"ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} wr1,
MID_COMPUTE_S2 MID_RESULT_S2_RELU wr2,
"cmp %w[remain], #1 \n" vmask_rp1,
"blt 4f \n" RIGHT_COMPUTE_S2 vmask_rp2,
RIGHT_RESULT_S2_RELU wmask,
"4: \n" wbias,
: [inptr0] "+r"(din0_ptr), vzero,
[inptr1] "+r"(din1_ptr), cnt,
[inptr2] "+r"(din2_ptr), cnt_remain,
[inptr3] "+r"(din3_ptr), act_param);
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21");
} else {
asm volatile(
INIT_S2
"ld1 {v15.4s}, [%[inptr0]] \n"
"ld1 {v18.4s}, [%[inptr1]] \n"
"ld1 {v19.4s}, [%[inptr2]] \n"
"ld1 {v20.4s}, [%[inptr3]] \n"
"ld1 {v21.4s}, [%[inptr4]] \n"
"ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8}
MID_COMPUTE_S2 MID_RESULT_S2
"cmp %w[remain], #1 \n"
"blt 4f \n" RIGHT_COMPUTE_S2
RIGHT_RESULT_S2
"4: \n"
: [inptr0] "+r"(din0_ptr),
[inptr1] "+r"(din1_ptr),
[inptr2] "+r"(din2_ptr),
[inptr3] "+r"(din3_ptr),
[inptr4] "+r"(din4_ptr),
[outptr0] "+r"(doutr0_ptr),
[outptr1] "+r"(doutr1_ptr),
[cnt] "+r"(cnt)
: [vzero] "w"(vzero),
[w0] "w"(wr0),
[w1] "w"(wr1),
[w2] "w"(wr2),
[remain] "r"(cnt_remain),
[mask1] "w"(vmask_rp1),
[mask2] "w"(vmask_rp2),
[wmask] "w"(wmask),
[vbias] "w"(wbias)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16",
"v17",
"v18",
"v19",
"v20",
"v21");
}
doutr0 = doutr0 + 2 * w_out; doutr0 = doutr0 + 2 * w_out;
} }
#else #else
...@@ -1576,64 +1984,36 @@ void conv_depthwise_3x3s2p0_bias(float* dout, ...@@ -1576,64 +1984,36 @@ void conv_depthwise_3x3s2p0_bias(float* dout,
} }
int cnt = tile_w; int cnt = tile_w;
unsigned int* mask_ptr = dmask; unsigned int* mask_ptr = dmask;
if (flag_relu) { asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 RIGHT_COMPUTE_S2
asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU RIGHT_RESULT_S2
RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU : [din0_ptr] "+r"(din0_ptr),
: [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr),
[din1_ptr] "+r"(din1_ptr), [din2_ptr] "+r"(din2_ptr),
[din2_ptr] "+r"(din2_ptr), [outptr] "+r"(doutr0_ptr),
[outptr] "+r"(doutr0_ptr), [cnt] "+r"(cnt),
[cnt] "+r"(cnt), [mask_ptr] "+r"(mask_ptr)
[mask_ptr] "+r"(mask_ptr) : [remain] "r"(cnt_remain),
: [remain] "r"(cnt_remain), [wr0] "w"(wr0),
[wr0] "w"(wr0), [wr1] "w"(wr1),
[wr1] "w"(wr1), [wr2] "w"(wr2),
[wr2] "w"(wr2), [bias] "r"(bias_c)
[bias] "r"(bias_c) : "cc",
: "cc", "memory",
"memory", "q3",
"q3", "q4",
"q4", "q5",
"q5", "q6",
"q6", "q7",
"q7", "q8",
"q8", "q9",
"q9", "q10",
"q10", "q11",
"q11", "q12",
"q12", "q13",
"q13", "q14",
"q14", "q15");
"q15"); if (act_param.has_active) {
} else { act_switch_process(doutr0, doutr0, w_out, &act_param);
asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 RIGHT_COMPUTE_S2
RIGHT_RESULT_S2
: [din0_ptr] "+r"(din0_ptr),
[din1_ptr] "+r"(din1_ptr),
[din2_ptr] "+r"(din2_ptr),
[outptr] "+r"(doutr0_ptr),
[cnt] "+r"(cnt),
[mask_ptr] "+r"(mask_ptr)
: [remain] "r"(cnt_remain),
[wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[bias] "r"(bias_c)
: "cc",
"memory",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
} }
doutr0 = doutr0 + w_out; doutr0 = doutr0 + w_out;
} }
...@@ -1650,13 +2030,13 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout, ...@@ -1650,13 +2030,13 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout,
const float* weights, const float* weights,
const float* bias, const float* bias,
bool flag_bias, bool flag_bias,
bool flag_relu,
const int num, const int num,
const int ch_in, const int ch_in,
const int h_in, const int h_in,
const int w_in, const int w_in,
const int h_out, const int h_out,
const int w_out, const int w_out,
const operators::ActivationParam act_param,
ARMContext* ctx) { ARMContext* ctx) {
int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
int out_pad_idx[4] = {0, 1, 2, 3}; int out_pad_idx[4] = {0, 1, 2, 3};
...@@ -1718,114 +2098,62 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout, ...@@ -1718,114 +2098,62 @@ void conv_depthwise_3x3s2p0_bias_s(float* dout,
unsigned int* mask_ptr = dmask; unsigned int* mask_ptr = dmask;
#ifdef __aarch64__ #ifdef __aarch64__
if (flag_relu) { asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0
asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU : [din0_ptr] "+r"(din0_ptr),
: [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr),
[din1_ptr] "+r"(din1_ptr), [din2_ptr] "+r"(din2_ptr),
[din2_ptr] "+r"(din2_ptr), [mask_ptr] "+r"(mask_ptr)
[mask_ptr] "+r"(mask_ptr) : [wr0] "w"(wr0),
: [wr0] "w"(wr0), [wr1] "w"(wr1),
[wr1] "w"(wr1), [wr2] "w"(wr2),
[wr2] "w"(wr2), [bias] "w"(vbias),
[bias] "w"(vbias), [out] "r"(out_buf)
[out] "r"(out_buf) : "cc",
: "cc", "memory",
"memory", "v4",
"v4", "v5",
"v5", "v6",
"v6", "v7",
"v7", "v8",
"v8", "v9",
"v9", "v10",
"v10", "v11",
"v11", "v12",
"v12", "v13",
"v13", "v14",
"v14", "v15",
"v15", "v16");
"v16");
} else {
asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0
: [din0_ptr] "+r"(din0_ptr),
[din1_ptr] "+r"(din1_ptr),
[din2_ptr] "+r"(din2_ptr),
[mask_ptr] "+r"(mask_ptr)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[bias] "w"(vbias),
[out] "r"(out_buf)
: "cc",
"memory",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11",
"v12",
"v13",
"v14",
"v15",
"v16");
}
#else #else
if (flag_relu) { asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0
asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU : [din0_ptr] "+r"(din0_ptr),
: [din0_ptr] "+r"(din0_ptr), [din1_ptr] "+r"(din1_ptr),
[din1_ptr] "+r"(din1_ptr), [din2_ptr] "+r"(din2_ptr)
[din2_ptr] "+r"(din2_ptr) : [wr0] "w"(wr0),
: [wr0] "w"(wr0), [wr1] "w"(wr1),
[wr1] "w"(wr1), [wr2] "w"(wr2),
[wr2] "w"(wr2), [bias] "r"(bias_c),
[bias] "r"(bias_c), [out] "r"(out_buf),
[out] "r"(out_buf), [mask_ptr] "r"(dmask)
[mask_ptr] "r"(dmask) : "cc",
: "cc", "memory",
"memory", "q3",
"q3", "q4",
"q4", "q5",
"q5", "q6",
"q6", "q7",
"q7", "q8",
"q8", "q9",
"q9", "q10",
"q10", "q11",
"q11", "q12",
"q12", "q13",
"q13", "q14",
"q14", "q15");
"q15");
} else {
asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0
: [din0_ptr] "+r"(din0_ptr),
[din1_ptr] "+r"(din1_ptr),
[din2_ptr] "+r"(din2_ptr)
: [wr0] "w"(wr0),
[wr1] "w"(wr1),
[wr2] "w"(wr2),
[bias] "r"(bias_c),
[out] "r"(out_buf),
[mask_ptr] "r"(dmask)
: "cc",
"memory",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
}
#endif #endif
if (act_param.has_active) {
act_switch_process(out_buf, out_buf, w_out, &act_param);
}
for (int w = 0; w < w_out; ++w) { for (int w = 0; w < w_out; ++w) {
*dout_channel++ = out_buf[w]; *dout_channel++ = out_buf[w];
} }
......
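The relu6 and leaky relu tails added in this file rely on a small NEON pattern: `fmin` against a broadcast six, and `cmhs`/`fmul`/`bif` to select between `x` and `x * scale`. An intrinsics sketch of the leaky relu select, written for clarity rather than as a transcription of the asm:

```cpp
#include <arm_neon.h>

// out[i] = x[i] >= 0 ? x[i] : x[i] * scale[i]; the semantics of the
// LEAKY_RELU blocks above (compare, multiply, then per-lane select).
static inline float32x4_t leaky_relu_f32x4(float32x4_t x, float32x4_t vscale) {
  uint32x4_t ge_zero = vcgeq_f32(x, vdupq_n_f32(0.f));  // lane mask: x >= 0
  float32x4_t scaled = vmulq_f32(x, vscale);            // x * scale
  return vbslq_f32(ge_zero, x, scaled);                 // pick x or scaled
}
```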
...@@ -25,6 +25,511 @@ namespace paddle { ...@@ -25,6 +25,511 @@ namespace paddle {
namespace lite { namespace lite {
namespace arm { namespace arm {
namespace math { namespace math {
#ifdef __aarch64__
#define COMPUTE \
"ldr q8, [%[bias]]\n" /* load bias */ \
"ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ \
"and v19.16b, v8.16b, v8.16b\n" \
"ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ \
"and v20.16b, v8.16b, v8.16b\n" \
"ldp q4, q5, [%[inr0]], #32\n" /* load input r0*/ \
"and v21.16b, v8.16b, v8.16b\n" \
"ldp q6, q7, [%[inr0]], #32\n" /* load input r0*/ \
"and v22.16b, v8.16b, v8.16b\n" \
"ldr q8, [%[inr0]]\n" /* load input r0*/ \
"fmla v19.4s , %[w0].4s, v0.4s\n" /* outr0 = w0 * r0, 0*/ \
"fmla v20.4s , %[w0].4s, v2.4s\n" /* outr1 = w0 * r0, 2*/ \
"fmla v21.4s , %[w0].4s, v4.4s\n" /* outr2 = w0 * r0, 4*/ \
"fmla v22.4s , %[w0].4s, v6.4s\n" /* outr3 = w0 * r0, 6*/ \
"fmla v19.4s , %[w1].4s, v1.4s\n" /* outr0 = w1 * r0, 1*/ \
"ldp q0, q1, [%[inr1]], #32\n" /* load input r1*/ \
"fmla v20.4s , %[w1].4s, v3.4s\n" /* outr1 = w1 * r0, 3*/ \
"fmla v21.4s , %[w1].4s, v5.4s\n" /* outr2 = w1 * r0, 5*/ \
"fmla v22.4s , %[w1].4s, v7.4s\n" /* outr3 = w1 * r0, 7*/ \
"fmla v19.4s , %[w2].4s, v2.4s\n" /* outr0 = w0 * r0, 2*/ \
"ldp q2, q3, [%[inr1]], #32\n" /* load input r1*/ \
"fmla v20.4s , %[w2].4s, v4.4s\n" /* outr1 = w0 * r0, 4*/ \
"ldp q4, q5, [%[inr1]], #32\n" /* load input r1*/ \
"fmla v21.4s , %[w2].4s, v6.4s\n" /* outr2 = w0 * r0, 6*/ \
"ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/ \
"fmla v22.4s , %[w2].4s, v8.4s\n" /* outr3 = w0 * r0, 8*/ \
"ldr q8, [%[inr1]]\n" /* load input r1*/ \
"fmla v19.4s , %[w3].4s, v0.4s\n" /* outr0 = w3 * r1, 0*/ \
"fmla v20.4s , %[w3].4s, v2.4s\n" /* outr1 = w3 * r1, 2*/ \
"fmla v21.4s , %[w3].4s, v4.4s\n" /* outr2 = w3 * r1, 4*/ \
"fmla v22.4s , %[w3].4s, v6.4s\n" /* outr3 = w3 * r1, 6*/ \
"fmla v19.4s , %[w4].4s, v1.4s\n" /* outr0 = w4 * r1, 1*/ \
"ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/ \
"fmla v20.4s , %[w4].4s, v3.4s\n" /* outr1 = w4 * r1, 3*/ \
"fmla v21.4s , %[w4].4s, v5.4s\n" /* outr2 = w4 * r1, 5*/ \
"fmla v22.4s , %[w4].4s, v7.4s\n" /* outr3 = w4 * r1, 7*/ \
"fmla v19.4s , %[w5].4s, v2.4s\n" /* outr0 = w5 * r1, 2*/ \
"ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/ \
"fmla v20.4s , %[w5].4s, v4.4s\n" /* outr1 = w5 * r1, 4*/ \
"ldp q4, q5, [%[inr2]], #32\n" /* load input r2*/ \
"fmla v21.4s , %[w5].4s, v6.4s\n" /* outr2 = w5 * r1, 6*/ \
"ldp q6, q7, [%[inr2]], #32\n" /* load input r2*/ \
"fmla v22.4s , %[w5].4s, v8.4s\n" /* outr3 = w5 * r1, 8*/ \
"ldr q8, [%[inr2]]\n" /* load input r2*/ \
"fmla v19.4s , %[w6].4s, v0.4s\n" /* outr0 = w6 * r2, 0*/ \
"fmla v20.4s , %[w6].4s, v2.4s\n" /* outr1 = w6 * r2, 2*/ \
"fmla v21.4s , %[w6].4s, v4.4s\n" /* outr2 = w6 * r2, 4*/ \
"fmla v22.4s , %[w6].4s, v6.4s\n" /* outr3 = w6 * r2, 6*/ \
"fmla v19.4s , %[w7].4s, v1.4s\n" /* outr0 = w7 * r2, 1*/ \
"fmla v20.4s , %[w7].4s, v3.4s\n" /* outr1 = w7 * r2, 3*/ \
"fmla v21.4s , %[w7].4s, v5.4s\n" /* outr2 = w7 * r2, 5*/ \
"fmla v22.4s , %[w7].4s, v7.4s\n" /* outr3 = w7 * r2, 7*/ \
"fmla v19.4s , %[w8].4s, v2.4s\n" /* outr0 = w8 * r2, 2*/ \
"fmla v20.4s , %[w8].4s, v4.4s\n" /* outr1 = w8 * r2, 4*/ \
"fmla v21.4s , %[w8].4s, v6.4s\n" /* outr2 = w8 * r2, 6*/ \
"fmla v22.4s , %[w8].4s, v8.4s\n" /* outr3 = w8 * r2, 8*/ \
"trn1 v0.4s, v19.4s, v20.4s\n" /* r0: a0a1c0c1*/ \
"trn2 v1.4s, v19.4s, v20.4s\n" /* r0: b0b1d0d1*/ \
"trn1 v2.4s, v21.4s, v22.4s\n" /* r0: a2a3c2c3*/ \
"trn2 v3.4s, v21.4s, v22.4s\n" /* r0: b2b3d2d3*/ \
"trn1 v19.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ \
"trn2 v21.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ \
"trn1 v20.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ \
"trn2 v22.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/
#define RELU /* relu */ \
"movi v0.4s, #0\n" /* for relu */ \
"fmax v19.4s, v19.4s, v0.4s\n" \
"fmax v20.4s, v20.4s, v0.4s\n" \
"fmax v21.4s, v21.4s, v0.4s\n" \
"fmax v22.4s, v22.4s, v0.4s\n"
#define RELU6 /* relu6 */ \
"fmin v19.4s, v19.4s, %[vsix].4s\n" \
"fmin v20.4s, v20.4s, %[vsix].4s\n" \
"fmin v21.4s, v21.4s, %[vsix].4s\n" \
"fmin v22.4s, v22.4s, %[vsix].4s\n"
#define LEAKY_RELU /* LeakyRelu */ \
"movi v0.4s, #0\n" /* for relu */ \
"cmhs v1.4s, v19.4s, v0.4s \n" /* vcgeq_u32 */ \
"fmul v2.4s, v19.4s, %[vscale].4s \n" /* mul */ \
"cmhs v3.4s, v20.4s, v0.4s \n" /* vcgeq_u32 */ \
"fmul v4.4s, v20.4s, %[vscale].4s \n" /* mul */ \
"cmhs v5.4s, v21.4s, v0.4s \n" /* vcgeq_u32 */ \
"fmul v6.4s, v21.4s, %[vscale].4s \n" /* mul */ \
"cmhs v7.4s, v22.4s, v0.4s \n" /* vcgeq_u32 */ \
"fmul v8.4s, v22.4s, %[vscale].4s \n" /* mul */ \
"bif v19.16b, v2.16b, v1.16b \n" /* choose*/ \
"bif v19.16b, v4.16b, v3.16b \n" /* choose*/ \
"bif v19.16b, v6.16b, v5.16b \n" /* choose*/ \
"bif v19.16b, v8.16b, v7.16b \n" /* choose*/
#define STORE /* save result */ \
"str q19, [%[outc0]], #16\n" \
"str q20, [%[outc1]], #16\n" \
"str q21, [%[outc2]], #16\n" \
"str q22, [%[outc3]], #16\n"
#else
#define COMPUTE \
/* fill with bias */ \
"vld1.32 {d16-d17}, [%[bias]]\n" /* load bias */ /* load weights */ \
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w0-2, to q9-11 */ \
"vld1.32 {d0-d3}, [%[r0]]!\n" /* load input r0, 0,1*/ \
"vand.i32 q12, q8, q8\n" \
"vld1.32 {d4-d7}, [%[r0]]!\n" /* load input r0, 2,3*/ \
"vand.i32 q13, q8, q8\n" \
"vld1.32 {d8-d11}, [%[r0]]!\n" /* load input r0, 4,5*/ \
"vand.i32 q14, q8, q8\n" \
"vld1.32 {d12-d15}, [%[r0]]!\n" /* load input r0, 6,7*/ \
"vand.i32 q15, q8, q8\n" \
"vld1.32 {d16-d17}, [%[r0]]\n" /* load input r0, 8*/ \
"vmla.f32 q12, q9, q0 @ w0 * inr0\n" \
"vmla.f32 q13, q9, q2 @ w0 * inr2\n" \
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w2, to q11 */ \
"vmla.f32 q14, q9, q4 @ w0 * inr4\n" \
"vmla.f32 q15, q9, q6 @ w0 * inr6\n" \
"vmla.f32 q12, q10, q1 @ w1 * inr1\n" \
"vld1.32 {d0-d3}, [%[r1]]! @ load r1, 0, 1\n" \
"vmla.f32 q13, q10, q3 @ w1 * inr3\n" \
"vmla.f32 q14, q10, q5 @ w1 * inr5\n" \
"vmla.f32 q15, q10, q7 @ w1 * inr7\n" \
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w3-4, to q9-10 */ \
"vmla.f32 q12, q11, q2 @ w2 * inr2\n" \
"vld1.32 {d4-d7}, [%[r1]]! @ load r1, 2, 3\n" \
"vmla.f32 q13, q11, q4 @ w2 * inr4\n" \
"vld1.32 {d8-d11}, [%[r1]]! @ load r1, 4, 5\n" \
"vmla.f32 q14, q11, q6 @ w2 * inr6\n" \
"vld1.32 {d12-d15}, [%[r1]]! @ load r1, 6, 7\n" \
"vmla.f32 q15, q11, q8 @ w2 * inr8\n" /* mul r1 with w3, w4*/ \
"vmla.f32 q12, q9, q0 @ w3 * inr0\n" \
"vmla.f32 q13, q9, q2 @ w3 * inr2\n" \
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w5, to q11 */ \
"vmla.f32 q14, q9, q4 @ w3 * inr4\n" \
"vmla.f32 q15, q9, q6 @ w3 * inr6\n" \
"vld1.32 {d16-d17}, [%[r1]]\n" /* load input r1, 8*/ \
"vmla.f32 q12, q10, q1 @ w4 * inr1\n" \
"vld1.32 {d0-d3}, [%[r2]]! @ load r2, 0, 1\n" \
"vmla.f32 q13, q10, q3 @ w4 * inr3\n" \
"vmla.f32 q14, q10, q5 @ w4 * inr5\n" \
"vmla.f32 q15, q10, q7 @ w4 * inr7\n" \
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w6-7, to q9-10 */ \
"vmla.f32 q12, q11, q2 @ w5 * inr2\n" \
"vld1.32 {d4-d7}, [%[r2]]! @ load r2, 2, 3\n" \
"vmla.f32 q13, q11, q4 @ w5 * inr4\n" \
"vld1.32 {d8-d11}, [%[r2]]! @ load r2, 4, 5\n" \
"vmla.f32 q14, q11, q6 @ w5 * inr6\n" \
"vld1.32 {d12-d15}, [%[r2]]! @ load r2, 6, 7\n" \
"vmla.f32 q15, q11, q8 @ w5 * inr8\n" /* mul r2 with w6, w7*/ \
"vmla.f32 q12, q9, q0 @ w6 * inr0\n" \
"vmla.f32 q13, q9, q2 @ w6 * inr2\n" \
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w8, to q11 */ \
"vmla.f32 q14, q9, q4 @ w6 * inr4\n" \
"vmla.f32 q15, q9, q6 @ w6 * inr6\n" \
"vld1.32 {d16-d17}, [%[r2]]\n" /* load input r2, 8*/ \
"vmla.f32 q12, q10, q1 @ w7 * inr1\n" \
"vmla.f32 q13, q10, q3 @ w7 * inr3\n" \
"vmla.f32 q14, q10, q5 @ w7 * inr5\n" \
"vmla.f32 q15, q10, q7 @ w7 * inr7\n" \
"sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" \
"vmla.f32 q12, q11, q2 @ w8 * inr2\n" \
"vmla.f32 q13, q11, q4 @ w8 * inr4\n" \
"vmla.f32 q14, q11, q6 @ w8 * inr6\n" \
"vmla.f32 q15, q11, q8 @ w8 * inr8\n" /* transpose */ \
"vtrn.32 q12, q13\n" /* a0a1c0c1, b0b1d0d1*/ \
"vtrn.32 q14, q15\n" /* a2a3c2c3, b2b3d2d3*/ \
"vswp d25, d28\n" /* a0a1a2a3, c0c1c2c3*/ \
"vswp d27, d30\n" /* b0b1b2b3, d0d1d2d3*/
#define RELU /* relu */ \
"vmov.u32 q0, #0\n" \
"vld1.32 {d2-d3}, [%[six_ptr]]\n" \
"vmax.f32 q12, q12, q0\n" \
"vmax.f32 q13, q13, q0\n" \
"vmax.f32 q14, q14, q0\n" \
"vmax.f32 q15, q15, q0\n"
#define RELU6 /* relu6 */ \
"vmin.f32 q12, q12, q1\n" \
"vmin.f32 q13, q13, q1\n" \
"vmin.f32 q14, q14, q1\n" \
"vmin.f32 q15, q15, q1\n"
#define LEAKY_RELU /* LeakyRelu */ \
"vmov.u32 q0, #0\n" \
"vld1.32 {d2-d3}, [%[scale_ptr]]\n" \
"vcge.f32 q2, q12, q0 @ q0 > 0 \n" \
"vcge.f32 q4, q13, q0 @ q0 > 0 \n" \
"vcge.f32 q6, q14, q0 @ q0 > 0 \n" \
"vcge.f32 q8, q15, q0 @ q0 > 0 \n" \
"vmul.f32 q3, q12, q1 @ mul \n" \
"vmul.f32 q5, q13, q1 @ mul \n" \
"vmul.f32 q7, q14, q1 @ mul \n" \
"vmul.f32 q9, q15, q1 @ mul \n" \
"vbif q12, q3, q2 @ choose \n" \
"vbif q13, q5, q4 @ choose \n" \
"vbif q14, q7, q6 @ choose \n" \
"vbif q15, q9, q8 @ choose \n"
#define STORE /* save result */ \
"vst1.32 {d24-d25}, [%[outc0]]!\n" /* save outc0*/ \
"vst1.32 {d26-d27}, [%[outc1]]!\n" /* save outc1*/ \
"vst1.32 {d28-d29}, [%[outc2]]!\n" /* save outc2*/ \
"vst1.32 {d30-d31}, [%[outc3]]!\n" /* save outc3*/
#endif
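// Note on the LEAKY_RELU blocks above: the cmhs/vcge + fmul + bif/vbif
// sequence is a branch-free select. In NEON intrinsics it is roughly the
// following (a sketch for one vector v, not part of this kernel):
//   uint32x4_t keep = vcgeq_f32(v, vdupq_n_f32(0.f));  // lanes where v >= 0
//   float32x4_t scaled = vmulq_f32(v, vscale);         // alpha * v
//   v = vbslq_f32(keep, v, scaled);                    // v if >= 0, else alpha * v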
void act_switch_3x3s2(const float* inr0,
const float* inr1,
const float* inr2,
float* outc0,
float* outc1,
float* outc2,
float* outc3,
const float* weight_c,
float* bias_local,
float32x4_t w0,
float32x4_t w1,
float32x4_t w2,
float32x4_t w3,
float32x4_t w4,
float32x4_t w5,
float32x4_t w6,
float32x4_t w7,
float32x4_t w8,
const operators::ActivationParam act_param) {
bool has_active = act_param.has_active;
if (has_active) {
float tmp = act_param.Relu_clipped_coef;
float ss = act_param.Leaky_relu_alpha;
#ifdef __aarch64__
float32x4_t vsix = vdupq_n_f32(tmp);
float32x4_t vscale = vdupq_n_f32(ss);
#else
float vsix[4] = {tmp, tmp, tmp, tmp};
float vscale[4] = {ss, ss, ss, ss};
#endif
switch (act_param.active_type) {
case lite_api::ActivationType::kRelu:
#ifdef __aarch64__
asm volatile(COMPUTE RELU STORE
: [inr0] "+r"(inr0),
[inr1] "+r"(inr1),
[inr2] "+r"(inr2),
[outc0] "+r"(outc0),
[outc1] "+r"(outc1),
[outc2] "+r"(outc2),
[outc3] "+r"(outc3)
: [w0] "w"(w0),
[w1] "w"(w1),
[w2] "w"(w2),
[w3] "w"(w3),
[w4] "w"(w4),
[w5] "w"(w5),
[w6] "w"(w6),
[w7] "w"(w7),
[w8] "w"(w8),
[bias] "r"(bias_local)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v19",
"v20",
"v21",
"v22");
#else
asm volatile(COMPUTE RELU STORE
: [r0] "+r"(inr0),
[r1] "+r"(inr1),
[r2] "+r"(inr2),
[wc0] "+r"(weight_c),
[outc0] "+r"(outc0),
[outc1] "+r"(outc1),
[outc2] "+r"(outc2),
[outc3] "+r"(outc3)
: [bias] "r"(bias_local), [six_ptr] "r"(vsix)
: "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
#endif
break;
case lite_api::ActivationType::kRelu6:
#ifdef __aarch64__
asm volatile(COMPUTE RELU RELU6 STORE
: [inr0] "+r"(inr0),
[inr1] "+r"(inr1),
[inr2] "+r"(inr2),
[outc0] "+r"(outc0),
[outc1] "+r"(outc1),
[outc2] "+r"(outc2),
[outc3] "+r"(outc3)
: [w0] "w"(w0),
[w1] "w"(w1),
[w2] "w"(w2),
[w3] "w"(w3),
[w4] "w"(w4),
[w5] "w"(w5),
[w6] "w"(w6),
[w7] "w"(w7),
[w8] "w"(w8),
[bias] "r"(bias_local),
[vsix] "w"(vsix)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v19",
"v20",
"v21",
"v22");
#else
asm volatile(COMPUTE RELU RELU6 STORE
: [r0] "+r"(inr0),
[r1] "+r"(inr1),
[r2] "+r"(inr2),
[wc0] "+r"(weight_c),
[outc0] "+r"(outc0),
[outc1] "+r"(outc1),
[outc2] "+r"(outc2),
[outc3] "+r"(outc3)
: [bias] "r"(bias_local), [six_ptr] "r"(vsix)
: "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
#endif
break;
case lite_api::ActivationType::kLeakyRelu:
#ifdef __aarch64__
asm volatile(COMPUTE LEAKY_RELU STORE
: [inr0] "+r"(inr0),
[inr1] "+r"(inr1),
[inr2] "+r"(inr2),
[outc0] "+r"(outc0),
[outc1] "+r"(outc1),
[outc2] "+r"(outc2),
[outc3] "+r"(outc3)
: [w0] "w"(w0),
[w1] "w"(w1),
[w2] "w"(w2),
[w3] "w"(w3),
[w4] "w"(w4),
[w5] "w"(w5),
[w6] "w"(w6),
[w7] "w"(w7),
[w8] "w"(w8),
[bias] "r"(bias_local),
[vscale] "w"(vscale)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v19",
"v20",
"v21",
"v22");
#else
asm volatile(COMPUTE LEAKY_RELU STORE
: [r0] "+r"(inr0),
[r1] "+r"(inr1),
[r2] "+r"(inr2),
[wc0] "+r"(weight_c),
[outc0] "+r"(outc0),
[outc1] "+r"(outc1),
[outc2] "+r"(outc2),
[outc3] "+r"(outc3)
: [bias] "r"(bias_local), [scale_ptr] "r"(vscale)
: "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
#endif
break;
default:
LOG(FATAL) << "this act_type: "
<< static_cast<int>(act_param.active_type)
<< " fuse not support";
}
} else {
#ifdef __aarch64__
asm volatile(COMPUTE STORE
: [inr0] "+r"(inr0),
[inr1] "+r"(inr1),
[inr2] "+r"(inr2),
[outc0] "+r"(outc0),
[outc1] "+r"(outc1),
[outc2] "+r"(outc2),
[outc3] "+r"(outc3)
: [w0] "w"(w0),
[w1] "w"(w1),
[w2] "w"(w2),
[w3] "w"(w3),
[w4] "w"(w4),
[w5] "w"(w5),
[w6] "w"(w6),
[w7] "w"(w7),
[w8] "w"(w8),
[bias] "r"(bias_local)
: "cc",
"memory",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v19",
"v20",
"v21",
"v22");
#else
asm volatile(COMPUTE STORE
: [r0] "+r"(inr0),
[r1] "+r"(inr1),
[r2] "+r"(inr2),
[wc0] "+r"(weight_c),
[outc0] "+r"(outc0),
[outc1] "+r"(outc1),
[outc2] "+r"(outc2),
[outc3] "+r"(outc3)
: [bias] "r"(bias_local)
: "cc",
"memory",
"q0",
"q1",
"q2",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14",
"q15");
#endif
}
}
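// For reference, a minimal scalar sketch of what one call of act_switch_3x3s2
// computes before the activation step: four output columns for one 4-channel
// block, with stride-2 input sampling. The helper below is illustrative only
// (it is not used by the kernel) and assumes the packed [column][channel]
// layout described in the asm comments above.
inline void depthwise_3x3s2_block_ref(const float* inr0,    // input row 2*oh
                                      const float* inr1,    // input row 2*oh + 1
                                      const float* inr2,    // input row 2*oh + 2
                                      const float* weight,  // 9 x 4 floats
                                      const float* bias,    // 4 floats
                                      float* out) {         // 4 x 4 floats
  const float* rows[3] = {inr0, inr1, inr2};
  for (int ow = 0; ow < 4; ++ow) {   // four output columns per call
    for (int c = 0; c < 4; ++c) {    // four channels per block
      float sum = bias[c];
      for (int kh = 0; kh < 3; ++kh) {
        for (int kw = 0; kw < 3; ++kw) {
          sum += weight[(kh * 3 + kw) * 4 + c] *
                 rows[kh][(2 * ow + kw) * 4 + c];  // stride-2 sampling
        }
      }
      out[c * 4 + ow] = sum;  // row c goes to outc0..outc3 respectively
    }
  }
}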
void conv_3x3s2_depthwise_fp32(const float* i_data, void conv_3x3s2_depthwise_fp32(const float* i_data,
float* o_data, float* o_data,
...@@ -38,6 +543,7 @@ void conv_3x3s2_depthwise_fp32(const float* i_data, ...@@ -38,6 +543,7 @@ void conv_3x3s2_depthwise_fp32(const float* i_data,
const float* weights, const float* weights,
const float* bias, const float* bias,
const operators::ConvParam& param, const operators::ConvParam& param,
const operators::ActivationParam act_param,
ARMContext* ctx) { ARMContext* ctx) {
auto paddings = *param.paddings; auto paddings = *param.paddings;
int threads = ctx->threads(); int threads = ctx->threads();
...@@ -51,11 +557,9 @@ void conv_3x3s2_depthwise_fp32(const float* i_data, ...@@ -51,11 +557,9 @@ void conv_3x3s2_depthwise_fp32(const float* i_data,
const int win_round = ROUNDUP(win_ext, 4); const int win_round = ROUNDUP(win_ext, 4);
const int hin_round = oh * 2 + 1; const int hin_round = oh * 2 + 1;
const int prein_size = win_round * hin_round * out_c_block; const int prein_size = win_round * hin_round * out_c_block;
auto workspace_size = auto workspace_size = threads * prein_size + win_round + ow_round;
threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/;
ctx->ExtendWorkspace(sizeof(float) * workspace_size); ctx->ExtendWorkspace(sizeof(float) * workspace_size);
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr; bool flag_bias = param.bias != nullptr;
/// get workspace /// get workspace
...@@ -77,6 +581,8 @@ void conv_3x3s2_depthwise_fp32(const float* i_data, ...@@ -77,6 +581,8 @@ void conv_3x3s2_depthwise_fp32(const float* i_data,
remain = remain > 0 ? remain : 0; remain = remain > 0 ? remain : 0;
int row_len = win_round * out_c_block; int row_len = win_round * out_c_block;
float32x4_t vzero = vdupq_n_f32(0.f);
for (int n = 0; n < bs; ++n) { for (int n = 0; n < bs; ++n) {
const float* din_batch = i_data + n * ic * size_in_channel; const float* din_batch = i_data + n * ic * size_in_channel;
float* dout_batch = o_data + n * oc * size_out_channel; float* dout_batch = o_data + n * oc * size_out_channel;
...@@ -147,201 +653,47 @@ void conv_3x3s2_depthwise_fp32(const float* i_data, ...@@ -147,201 +653,47 @@ void conv_3x3s2_depthwise_fp32(const float* i_data,
outc2 = pre_out + 8; outc2 = pre_out + 8;
outc3 = pre_out + 12; outc3 = pre_out + 12;
} }
// clang-format off
#ifdef __aarch64__ #ifdef __aarch64__
asm volatile( act_switch_3x3s2(inr0,
"ldr q8, [%[bias]]\n" /* load bias */ inr1,
"ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ inr2,
"and v19.16b, v8.16b, v8.16b\n" outc0,
"ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ outc1,
"and v20.16b, v8.16b, v8.16b\n" outc2,
"ldp q4, q5, [%[inr0]], #32\n" /* load input r0*/ outc3,
"and v21.16b, v8.16b, v8.16b\n" weight_c,
"ldp q6, q7, [%[inr0]], #32\n" /* load input r0*/ bias_local,
"and v22.16b, v8.16b, v8.16b\n" w0,
"ldr q8, [%[inr0]]\n" /* load input r0*/ w1,
/* r0 mul w0-w2, get out */ w2,
"fmla v19.4s , %[w0].4s, v0.4s\n" /* outr0 = w0 * r0, 0*/ w3,
"fmla v20.4s , %[w0].4s, v2.4s\n" /* outr1 = w0 * r0, 2*/ w4,
"fmla v21.4s , %[w0].4s, v4.4s\n" /* outr2 = w0 * r0, 4*/ w5,
"fmla v22.4s , %[w0].4s, v6.4s\n" /* outr3 = w0 * r0, 6*/ w6,
"fmla v19.4s , %[w1].4s, v1.4s\n" /* outr0 = w1 * r0, 1*/ w7,
"ldp q0, q1, [%[inr1]], #32\n" /* load input r1*/ w8,
"fmla v20.4s , %[w1].4s, v3.4s\n" /* outr1 = w1 * r0, 3*/ act_param);
"fmla v21.4s , %[w1].4s, v5.4s\n" /* outr2 = w1 * r0, 5*/
"fmla v22.4s , %[w1].4s, v7.4s\n" /* outr3 = w1 * r0, 7*/
"fmla v19.4s , %[w2].4s, v2.4s\n" /* outr0 = w0 * r0, 2*/
"ldp q2, q3, [%[inr1]], #32\n" /* load input r1*/
"fmla v20.4s , %[w2].4s, v4.4s\n" /* outr1 = w0 * r0, 4*/
"ldp q4, q5, [%[inr1]], #32\n" /* load input r1*/
"fmla v21.4s , %[w2].4s, v6.4s\n" /* outr2 = w0 * r0, 6*/
"ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/
"fmla v22.4s , %[w2].4s, v8.4s\n" /* outr3 = w0 * r0, 8*/
"ldr q8, [%[inr1]]\n" /* load input r1*/
/* r1, mul w3-w5, get out */
"fmla v19.4s , %[w3].4s, v0.4s\n" /* outr0 = w3 * r1, 0*/
"fmla v20.4s , %[w3].4s, v2.4s\n" /* outr1 = w3 * r1, 2*/
"fmla v21.4s , %[w3].4s, v4.4s\n" /* outr2 = w3 * r1, 4*/
"fmla v22.4s , %[w3].4s, v6.4s\n" /* outr3 = w3 * r1, 6*/
"fmla v19.4s , %[w4].4s, v1.4s\n" /* outr0 = w4 * r1, 1*/
"ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/
"fmla v20.4s , %[w4].4s, v3.4s\n" /* outr1 = w4 * r1, 3*/
"fmla v21.4s , %[w4].4s, v5.4s\n" /* outr2 = w4 * r1, 5*/
"fmla v22.4s , %[w4].4s, v7.4s\n" /* outr3 = w4 * r1, 7*/
"fmla v19.4s , %[w5].4s, v2.4s\n" /* outr0 = w5 * r1, 2*/
"ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/
"fmla v20.4s , %[w5].4s, v4.4s\n" /* outr1 = w5 * r1, 4*/
"ldp q4, q5, [%[inr2]], #32\n" /* load input r2*/
"fmla v21.4s , %[w5].4s, v6.4s\n" /* outr2 = w5 * r1, 6*/
"ldp q6, q7, [%[inr2]], #32\n" /* load input r2*/
"fmla v22.4s , %[w5].4s, v8.4s\n" /* outr3 = w5 * r1, 8*/
"ldr q8, [%[inr2]]\n" /* load input r2*/
/* r2, mul w6-w8, get out r0, r1 */
"fmla v19.4s , %[w6].4s, v0.4s\n" /* outr0 = w6 * r2, 0*/
"fmla v20.4s , %[w6].4s, v2.4s\n" /* outr1 = w6 * r2, 2*/
"fmla v21.4s , %[w6].4s, v4.4s\n" /* outr2 = w6 * r2, 4*/
"fmla v22.4s , %[w6].4s, v6.4s\n" /* outr3 = w6 * r2, 6*/
"fmla v19.4s , %[w7].4s, v1.4s\n" /* outr0 = w7 * r2, 1*/
"fmla v20.4s , %[w7].4s, v3.4s\n" /* outr1 = w7 * r2, 3*/
"fmla v21.4s , %[w7].4s, v5.4s\n" /* outr2 = w7 * r2, 5*/
"fmla v22.4s , %[w7].4s, v7.4s\n" /* outr3 = w7 * r2, 7*/
"fmla v19.4s , %[w8].4s, v2.4s\n" /* outr0 = w8 * r2, 2*/
"fmla v20.4s , %[w8].4s, v4.4s\n" /* outr1 = w8 * r2, 4*/
"fmla v21.4s , %[w8].4s, v6.4s\n" /* outr2 = w8 * r2, 6*/
"fmla v22.4s , %[w8].4s, v8.4s\n" /* outr3 = w8 * r2, 8*/
/* transpose */
"trn1 v0.4s, v19.4s, v20.4s\n" /* r0: a0a1c0c1*/
"trn2 v1.4s, v19.4s, v20.4s\n" /* r0: b0b1d0d1*/
"trn1 v2.4s, v21.4s, v22.4s\n" /* r0: a2a3c2c3*/
"trn2 v3.4s, v21.4s, v22.4s\n" /* r0: b2b3d2d3*/
"trn1 v19.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/
"trn2 v21.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/
"trn1 v20.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/
"trn2 v22.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/
/* relu */
"cbz %w[flag_relu], 0f\n" /* skip relu*/
"movi v0.4s, #0\n" /* for relu */
"fmax v19.4s, v19.4s, v0.4s\n"
"fmax v20.4s, v20.4s, v0.4s\n"
"fmax v21.4s, v21.4s, v0.4s\n"
"fmax v22.4s, v22.4s, v0.4s\n"
/* save result */
"0:\n"
"str q19, [%[outc0]], #16\n"
"str q20, [%[outc1]], #16\n"
"str q21, [%[outc2]], #16\n"
"str q22, [%[outc3]], #16\n"
:[inr0] "+r"(inr0), [inr1] "+r"(inr1),
[inr2] "+r"(inr2),
[outc0]"+r"(outc0), [outc1]"+r"(outc1),
[outc2]"+r"(outc2), [outc3]"+r"(outc3)
:[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2),
[w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5),
[w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8),
[bias] "r" (bias_local), [flag_relu]"r"(flag_relu)
: "cc", "memory",
"v0","v1","v2","v3","v4","v5","v6","v7",
"v8", "v19","v20","v21","v22"
);
#else #else
asm volatile( act_switch_3x3s2(inr0,
/* fill with bias */ inr1,
"vld1.32 {d16-d17}, [%[bias]]\n" /* load bias */ inr2,
/* load weights */ outc0,
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w0-2, to q9-11 */ outc1,
"vld1.32 {d0-d3}, [%[r0]]!\n" /* load input r0, 0,1*/ outc2,
"vand.i32 q12, q8, q8\n" outc3,
"vld1.32 {d4-d7}, [%[r0]]!\n" /* load input r0, 2,3*/ weight_c,
"vand.i32 q13, q8, q8\n" bias_local,
"vld1.32 {d8-d11}, [%[r0]]!\n" /* load input r0, 4,5*/ vzero,
"vand.i32 q14, q8, q8\n" vzero,
"vld1.32 {d12-d15}, [%[r0]]!\n" /* load input r0, 6,7*/ vzero,
"vand.i32 q15, q8, q8\n" vzero,
"vld1.32 {d16-d17}, [%[r0]]\n" /* load input r0, 8*/ vzero,
/* mul r0 with w0, w1, w2 */ vzero,
"vmla.f32 q12, q9, q0 @ w0 * inr0\n" vzero,
"vmla.f32 q13, q9, q2 @ w0 * inr2\n" vzero,
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w2, to q11 */ vzero,
"vmla.f32 q14, q9, q4 @ w0 * inr4\n" act_param);
"vmla.f32 q15, q9, q6 @ w0 * inr6\n" #endif
"vmla.f32 q12, q10, q1 @ w1 * inr1\n"
"vld1.32 {d0-d3}, [%[r1]]! @ load r1, 0, 1\n"
"vmla.f32 q13, q10, q3 @ w1 * inr3\n"
"vmla.f32 q14, q10, q5 @ w1 * inr5\n"
"vmla.f32 q15, q10, q7 @ w1 * inr7\n"
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w3-4, to q9-10 */
"vmla.f32 q12, q11, q2 @ w2 * inr2\n"
"vld1.32 {d4-d7}, [%[r1]]! @ load r1, 2, 3\n"
"vmla.f32 q13, q11, q4 @ w2 * inr4\n"
"vld1.32 {d8-d11}, [%[r1]]! @ load r1, 4, 5\n"
"vmla.f32 q14, q11, q6 @ w2 * inr6\n"
"vld1.32 {d12-d15}, [%[r1]]! @ load r1, 6, 7\n"
"vmla.f32 q15, q11, q8 @ w2 * inr8\n"
/* mul r1 with w3, w4, w5 */
"vmla.f32 q12, q9, q0 @ w3 * inr0\n"
"vmla.f32 q13, q9, q2 @ w3 * inr2\n"
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w5, to q11 */
"vmla.f32 q14, q9, q4 @ w3 * inr4\n"
"vmla.f32 q15, q9, q6 @ w3 * inr6\n"
"vld1.32 {d16-d17}, [%[r1]]\n" /* load input r1, 8*/
"vmla.f32 q12, q10, q1 @ w4 * inr1\n"
"vld1.32 {d0-d3}, [%[r2]]! @ load r2, 0, 1\n"
"vmla.f32 q13, q10, q3 @ w4 * inr3\n"
"vmla.f32 q14, q10, q5 @ w4 * inr5\n"
"vmla.f32 q15, q10, q7 @ w4 * inr7\n"
"vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w6-7, to q9-10 */
"vmla.f32 q12, q11, q2 @ w5 * inr2\n"
"vld1.32 {d4-d7}, [%[r2]]! @ load r2, 2, 3\n"
"vmla.f32 q13, q11, q4 @ w5 * inr4\n"
"vld1.32 {d8-d11}, [%[r2]]! @ load r2, 4, 5\n"
"vmla.f32 q14, q11, q6 @ w5 * inr6\n"
"vld1.32 {d12-d15}, [%[r2]]! @ load r2, 6, 7\n"
"vmla.f32 q15, q11, q8 @ w5 * inr8\n"
/* mul r2 with w6, w7, w8 */
"vmla.f32 q12, q9, q0 @ w6 * inr0\n"
"vmla.f32 q13, q9, q2 @ w6 * inr2\n"
"vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w8, to q11 */
"vmla.f32 q14, q9, q4 @ w6 * inr4\n"
"vmla.f32 q15, q9, q6 @ w6 * inr6\n"
"vld1.32 {d16-d17}, [%[r2]]\n" /* load input r2, 8*/
"vmla.f32 q12, q10, q1 @ w7 * inr1\n"
"vmla.f32 q13, q10, q3 @ w7 * inr3\n"
"vmla.f32 q14, q10, q5 @ w7 * inr5\n"
"vmla.f32 q15, q10, q7 @ w7 * inr7\n"
"sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n"
"vmla.f32 q12, q11, q2 @ w8 * inr2\n"
"vmla.f32 q13, q11, q4 @ w8 * inr4\n"
"vmla.f32 q14, q11, q6 @ w8 * inr6\n"
"vmla.f32 q15, q11, q8 @ w8 * inr8\n"
/* transpose */
"vtrn.32 q12, q13\n" /* a0a1c0c1, b0b1d0d1*/
"vtrn.32 q14, q15\n" /* a2a3c2c3, b2b3d2d3*/
"vswp d25, d28\n" /* a0a1a2a3, c0c1c2c3*/
"vswp d27, d30\n" /* b0b1b2b3, d0d1d2d3*/
"cmp %[flag_relu], #0\n"
"beq 0f\n" /* skip relu*/
"vmov.u32 q0, #0\n"
"vmax.f32 q12, q12, q0\n"
"vmax.f32 q13, q13, q0\n"
"vmax.f32 q14, q14, q0\n"
"vmax.f32 q15, q15, q0\n"
"0:\n"
"vst1.32 {d24-d25}, [%[outc0]]!\n" /* save outc0*/
"vst1.32 {d26-d27}, [%[outc1]]!\n" /* save outc1*/
"vst1.32 {d28-d29}, [%[outc2]]!\n" /* save outc2*/
"vst1.32 {d30-d31}, [%[outc3]]!\n" /* save outc3*/
:[r0] "+r"(inr0), [r1] "+r"(inr1),
[r2] "+r"(inr2), [wc0] "+r" (weight_c),
[outc0]"+r"(outc0), [outc1]"+r"(outc1),
[outc2]"+r"(outc2), [outc3]"+r"(outc3)
:[bias] "r" (bias_local),
[flag_relu]"r"(flag_relu)
:"cc", "memory",
"q0","q1","q2","q3","q4","q5","q6","q7",
"q8", "q9","q10","q11","q12","q13","q14","q15"
);
#endif // __arch64__
// clang-format off
if (flag_mask) { if (flag_mask) {
for (int i = 0; i < remain; ++i) { for (int i = 0; i < remain; ++i) {
c0[i] = pre_out[i]; c0[i] = pre_out[i];
...@@ -350,6 +702,13 @@ void conv_3x3s2_depthwise_fp32(const float* i_data, ...@@ -350,6 +702,13 @@ void conv_3x3s2_depthwise_fp32(const float* i_data,
c3[i] = pre_out[i + 12]; c3[i] = pre_out[i + 12];
} }
} }
inr0 += 32;
inr1 += 32;
inr2 += 32;
outc0 += 4;
outc1 += 4;
outc2 += 4;
outc3 += 4;
} }
} }
} }
......
...@@ -2151,6 +2151,210 @@ inline void act_switch_c8_fp32(const float* din_ptr, ...@@ -2151,6 +2151,210 @@ inline void act_switch_c8_fp32(const float* din_ptr,
} }
} }
#ifdef __aarch64__
#define LOAD_DATA \
"1: \n" \
"ld1 {v0.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
"ld1 {v1.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
"ld1 {v2.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/ \
"ld1 {v3.4s}, [%[din_ptr]], #16 \n" /*vld1q_f32(din_ptr0)*/
#define DO_RELU \
"fmax v0.4s, v0.4s, %[vzero].4s \n" /* vmaxq_f32() */ \
"fmax v1.4s, v1.4s, %[vzero].4s \n" /* vmaxq_f32() */ \
"fmax v2.4s, v2.4s, %[vzero].4s \n" /* vmaxq_f32() */ \
"fmax v3.4s, v3.4s, %[vzero].4s \n" /* vmaxq_f32() */
#define DO_RELU6 \
"fmin v0.4s, v0.4s, %[vsix].4s \n" /* vmaxq_f32() */ \
"fmin v1.4s, v1.4s, %[vsix].4s \n" /* vmaxq_f32() */ \
"fmin v2.4s, v2.4s, %[vsix].4s \n" /* vmaxq_f32() */ \
"fmin v3.4s, v3.4s, %[vsix].4s \n" /* vmaxq_f32() */
#define DO_LEAKY_RELU \
"cmhs v4.4s, v0.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v5.4s, v0.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"cmhs v6.4s, v1.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v7.4s, v1.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"cmhs v8.4s, v2.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v9.4s, v2.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"cmhs v10.4s, v3.4s, %[vzero].4s \n" /* vcgeq_u32 */ \
"fmul v11.4s, v3.4s, %[vscale].4s \n" /* vmulq_f32 */ \
"bif v0.16b, v5.16b, v4.16b \n" /* choose*/ \
"bif v1.16b, v7.16b, v6.16b \n" /* choose*/ \
"bif v2.16b, v9.16b, v8.16b \n" /* choose*/ \
"bif v3.16b, v11.16b, v10.16b \n" /* choose*/
#define DO_STORE \
"subs %w[cnt], %w[cnt], #1 \n" \
"st1 {v0.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
"st1 {v1.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
"st1 {v2.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
"st1 {v3.4s}, [%[dout_ptr]], #16 \n" /* vst1q_f32() */ \
"bne 1b \n"
#else
#define LOAD_DATA \
"1: \n" \
"vld1.32 {d6-d7}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
"vld1.32 {d8-d9}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
"vld1.32 {d10-d11}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n" \
"vld1.32 {d12-d13}, [%[din_ptr]]! @ vld1q_f32(din_ptr) \n"
#define DO_RELU \
"vmax.f32 q3, q3, %q[vzero] @ vmaxq_f32() \n" \
"vmax.f32 q4, q4, %q[vzero] @ vmaxq_f32() \n" \
"vmax.f32 q5, q5, %q[vzero] @ vmaxq_f32() \n" \
"vmax.f32 q6, q6, %q[vzero] @ vmaxq_f32() \n"
#define DO_RELU6 \
"vmin.f32 q3, q3, %q[vsix] @ vminq_f32() \n" \
"vmin.f32 q4, q4, %q[vsix] @ vmaxq_f32() \n" \
"vmin.f32 q5, q5, %q[vsix] @ vmaxq_f32() \n" \
"vmin.f32 q6, q6, %q[vsix] @ vmaxq_f32() \n"
#define DO_LEAKY_RELU \
"vcge.f32 q7, q3, %q[vzero] @ vcgeq_u32 \n" \
"vmul.f32 q8, q3, %q[vscale] @ vmulq_f32 \n" \
"vcge.f32 q9, q4, %q[vzero] @ vcgeq_u32 \n" \
"vmul.f32 q10, q4, %q[vscale] @ vmulq_f32 \n" \
"vcge.f32 q11, q5, %q[vzero] @ vcgeq_u32 \n" \
"vmul.f32 q12, q5, %q[vscale] @ vmulq_f32 \n" \
"vcge.f32 q13, q6, %q[vzero] @ vcgeq_u32 \n" \
"vmul.f32 q14, q6, %q[vscale] @ vmulq_f32 \n" \
"vbif q3, q8, q7 @ choose \n" \
"vbif q4, q10, q9 @ choose \n" \
"vbif q5, q12, q11 @ choose \n" \
"vbif q6, q13, q13 @ choose \n"
#define DO_STORE \
"subs %[cnt], #1 \n" \
"vst1.32 {d6-d7}, [%[dout_ptr]]! @ vst1q_f32() \n" \
"vst1.32 {d8-d9}, [%[dout_ptr]]! @ vst1q_f32() \n" \
"vst1.32 {d10-d11}, [%[dout_ptr]]! @ vst1q_f32() \n" \
"vst1.32 {d12-d13}, [%[dout_ptr]]! @ vst1q_f32() \n" \
"bne 1b \n"
#endif
/*
 * Apply the fused activation to the data.
 * Currently supports relu, relu6 and leaky_relu.
 */
inline void act_switch_process(float* src,
float* dst,
int size,
const operators::ActivationParam* act_param) {
int cnt = size >> 4;
int remain = size % 16;
float32x4_t vzero = vdupq_n_f32(0.f);
if (act_param != nullptr && act_param->has_active) {
float32x4_t vsix = vdupq_n_f32(act_param->Relu_clipped_coef);
float32x4_t vscale = vdupq_n_f32(act_param->Leaky_relu_alpha);
if (cnt > 0) {
switch (act_param->active_type) {
case lite_api::ActivationType::kRelu:
#ifdef __aarch64__
asm volatile(
LOAD_DATA DO_RELU DO_STORE
: [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
: [vzero] "w"(vzero)
: "memory", "cc", "v0", "v1", "v2", "v3");
#else
asm volatile(
LOAD_DATA DO_RELU DO_STORE
: [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
: [vzero] "w"(vzero)
: "memory", "cc", "q3", "q4", "q5", "q6");
#endif
break;
case lite_api::ActivationType::kRelu6:
#ifdef __aarch64__
asm volatile(
LOAD_DATA DO_RELU DO_RELU6 DO_STORE
: [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
: [vzero] "w"(vzero), [vsix] "w"(vsix)
: "memory", "cc", "v0", "v1", "v2", "v3");
#else
asm volatile(
LOAD_DATA DO_RELU DO_RELU6 DO_STORE
: [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
: [vzero] "w"(vzero), [vsix] "w"(vsix)
: "memory", "cc", "q3", "q4", "q5", "q6");
#endif
break;
case lite_api::ActivationType::kLeakyRelu:
#ifdef __aarch64__
asm volatile(
LOAD_DATA DO_LEAKY_RELU DO_STORE
: [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
: [vzero] "w"(vzero), [vscale] "w"(vscale)
: "memory",
"cc",
"v0",
"v1",
"v2",
"v3",
"v4",
"v5",
"v6",
"v7",
"v8",
"v9",
"v10",
"v11");
#else
asm volatile(
LOAD_DATA DO_LEAKY_RELU DO_STORE
: [din_ptr] "+r"(src), [dout_ptr] "+r"(dst), [cnt] "+r"(cnt)
: [vzero] "w"(vzero), [vscale] "w"(vscale)
: "memory",
"cc",
"q3",
"q4",
"q5",
"q6",
"q7",
"q8",
"q9",
"q10",
"q11",
"q12",
"q13",
"q14");
#endif
break;
default:
LOG(FATAL) << "this act_type: "
<< static_cast<int>(act_param->active_type)
<< " fuse not support";
}
}
// remain
switch (act_param->active_type) {
case lite_api::ActivationType::kRelu:
for (int i = 0; i < remain; i++) {
*dst = *src >= 0.f ? *src : 0.f;
src++;
dst++;
        }
        break;
case lite_api::ActivationType::kRelu6:
for (int i = 0; i < remain; i++) {
float tmp = *src >= 0.f ? *src : 0.f;
*dst = tmp <= act_param->Relu_clipped_coef
? tmp
: act_param->Relu_clipped_coef;
src++;
dst++;
        }
        break;
case lite_api::ActivationType::kLeakyRelu:
for (int i = 0; i < remain; i++) {
if (*src >= 0.f) {
*dst = *src;
} else {
*dst = *src * act_param->Leaky_relu_alpha;
}
src++;
dst++;
}
break;
default:
LOG(FATAL) << "this act_type: "
<< static_cast<int>(act_param->active_type)
<< " fuse not support";
}
}
}
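// A minimal usage sketch (buffer and size are illustrative only): apply a
// fused relu6 in place to a contiguous block of floats.
//
//   operators::ActivationParam act_param;
//   act_param.has_active = true;
//   act_param.active_type = lite_api::ActivationType::kRelu6;
//   act_param.Relu_clipped_coef = 6.f;
//   act_switch_process(buffer, buffer, size, &act_param);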
/*write result in outputs /*write result in outputs
* input din: [n, c / 8, h, w * 8], output dout: [n, c, h, w] * input din: [n, c / 8, h, w * 8], output dout: [n, c, h, w]
*/ */
......
...@@ -52,6 +52,7 @@ void conv_3x3s2_depthwise_fp32(const float* i_data, ...@@ -52,6 +52,7 @@ void conv_3x3s2_depthwise_fp32(const float* i_data,
const float* weights, const float* weights,
const float* bias, const float* bias,
const operators::ConvParam& param, const operators::ConvParam& param,
const operators::ActivationParam act_param,
ARMContext* ctx); ARMContext* ctx);
void conv_depthwise_3x3s1_fp32(const float* din, void conv_depthwise_3x3s1_fp32(const float* din,
...@@ -67,7 +68,6 @@ void conv_depthwise_3x3s1_fp32(const float* din, ...@@ -67,7 +68,6 @@ void conv_depthwise_3x3s1_fp32(const float* din,
const float* bias, const float* bias,
int pad, int pad,
bool flag_bias, bool flag_bias,
bool flag_relu,
const operators::ActivationParam act_param, const operators::ActivationParam act_param,
ARMContext* ctx); ARMContext* ctx);
...@@ -84,7 +84,7 @@ void conv_depthwise_3x3s2_fp32(const float* din, ...@@ -84,7 +84,7 @@ void conv_depthwise_3x3s2_fp32(const float* din,
const float* bias, const float* bias,
int pad, int pad,
bool flag_bias, bool flag_bias,
bool flag_relu, const operators::ActivationParam act_param,
ARMContext* ctx); ARMContext* ctx);
template <typename Dtype> template <typename Dtype>
......
...@@ -584,7 +584,6 @@ void conv_depthwise_3x3_fp32(const void* din, ...@@ -584,7 +584,6 @@ void conv_depthwise_3x3_fp32(const void* din,
const int pad_w = paddings[2]; const int pad_w = paddings[2];
int stride = param.strides[1]; int stride = param.strides[1];
int pad = pad_w; int pad = pad_w;
bool flag_relu = param.fuse_relu;
bool flag_bias = param.bias != nullptr; bool flag_bias = param.bias != nullptr;
bool pads_equal = bool pads_equal =
((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
...@@ -603,7 +602,6 @@ void conv_depthwise_3x3_fp32(const void* din, ...@@ -603,7 +602,6 @@ void conv_depthwise_3x3_fp32(const void* din,
bias, bias,
pad, pad,
flag_bias, flag_bias,
flag_relu,
act_param, act_param,
ctx); ctx);
} else { } else {
...@@ -638,7 +636,7 @@ void conv_depthwise_3x3_fp32(const void* din, ...@@ -638,7 +636,7 @@ void conv_depthwise_3x3_fp32(const void* din,
bias, bias,
pad, pad,
flag_bias, flag_bias,
flag_relu, act_param,
ctx); ctx);
} else { } else {
conv_3x3s2_depthwise_fp32(reinterpret_cast<const float*>(din), conv_3x3s2_depthwise_fp32(reinterpret_cast<const float*>(din),
...@@ -653,6 +651,7 @@ void conv_depthwise_3x3_fp32(const void* din, ...@@ -653,6 +651,7 @@ void conv_depthwise_3x3_fp32(const void* din,
reinterpret_cast<const float*>(weights), reinterpret_cast<const float*>(weights),
bias, bias,
param, param,
act_param,
ctx); ctx);
} }
} else { } else {
......
...@@ -1404,8 +1404,8 @@ void sgemm_prepack_c4_small(int M, ...@@ -1404,8 +1404,8 @@ void sgemm_prepack_c4_small(int M,
/* load a0, a1 */ /* load a0, a1 */
"ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n"
"bne 1b \n" "bne 1b \n"
"fadd v8.4s, v8.4s, v9.4s \n"
"2:\n" "2:\n"
"fadd v8.4s, v8.4s, v9.4s \n"
"st1 {v8.4s}, [%[c]], #16 \n" "st1 {v8.4s}, [%[c]], #16 \n"
: [a] "+r" (a_ptr), : [a] "+r" (a_ptr),
[b] "+r" (b_ptr), [b] "+r" (b_ptr),
...@@ -1660,8 +1660,8 @@ void sgemm_prepack_c4_small(int M, ...@@ -1660,8 +1660,8 @@ void sgemm_prepack_c4_small(int M,
/* load a0, a1 */ /* load a0, a1 */
"vld1.32 {d2-d5}, [%[a]]! \n" "vld1.32 {d2-d5}, [%[a]]! \n"
"bne 1b \n" "bne 1b \n"
"vadd.f32 q5, q5, q6 \n"
"2:\n" "2:\n"
"vadd.f32 q5, q5, q6 \n"
"vst1.32 {d10-d11}, [%[c]]!\n" "vst1.32 {d10-d11}, [%[c]]!\n"
: [a] "+r" (a_ptr), : [a] "+r" (a_ptr),
[b] "+r" (b_ptr), [b] "+r" (b_ptr),
......
...@@ -89,9 +89,15 @@ bool CudnnConv2D<PRECISION(kFloat)>::create(const operators::ConvParam& param, ...@@ -89,9 +89,15 @@ bool CudnnConv2D<PRECISION(kFloat)>::create(const operators::ConvParam& param,
this->act_desc_, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0)); this->act_desc_, CUDNN_ACTIVATION_RELU, CUDNN_NOT_PROPAGATE_NAN, 0.0));
} }
#if CUDNN_VERSION_MIN(7, 0, 0)
cudnnMathType_t math_type =
use_tensor_core_ ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH;
CUDNN_CHECK(cudnnSetConvolutionMathType(this->conv_desc_, math_type));
#endif
if (ic == param.groups && ic == oc && ic != 1) { if (ic == param.groups && ic == oc && ic != 1) {
this->fwd_algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; this->fwd_algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM;
} else if (1) { } else if (!param.var_length) {
const auto* i_data = param.x->data<float>(); const auto* i_data = param.x->data<float>();
const auto* w_data = param.filter->data<float>(); const auto* w_data = param.filter->data<float>();
auto* o_data = param.output->mutable_data<float>(TARGET(kCUDA)); auto* o_data = param.output->mutable_data<float>(TARGET(kCUDA));
......
...@@ -55,6 +55,8 @@ class Gemm { ...@@ -55,6 +55,8 @@ class Gemm {
PtypeOut* c, PtypeOut* c,
Context<TARGET(kCUDA)>* ctx); Context<TARGET(kCUDA)>* ctx);
cublasHandle_t get_handle() const { return cu_handle_; }
private: private:
cudaStream_t exe_stream_; cudaStream_t exe_stream_;
cublasHandle_t cu_handle_; cublasHandle_t cu_handle_;
......
...@@ -30,7 +30,12 @@ std::unique_ptr<xtcl::network::xRuntimeInstance> Device::Build( ...@@ -30,7 +30,12 @@ std::unique_ptr<xtcl::network::xRuntimeInstance> Device::Build(
// The XPU compiler build the graph and fill all of the constant params, only // The XPU compiler build the graph and fill all of the constant params, only
// one output is supported now. // one output is supported now.
xtcl::xNetwork network = builder->FinalizeNetwork(*((*outputs)[0])); xtcl::Array<xtcl::xExpr> all_outs;
for (size_t i = 0; i < outputs->size(); i++) {
all_outs.push_back(*outputs->at(i));
}
xtcl::xNetwork network =
builder->FinalizeNetwork(xtcl::relay::TupleNode::make(all_outs));
auto target = xtcl::Target::Create(device_name_); auto target = xtcl::Target::Create(device_name_);
auto compiler = xtcl::network::xTensorCompiler(network, target); auto compiler = xtcl::network::xTensorCompiler(network, target);
compiler.SetParams(*params); // Set the data of constant tensors compiler.SetParams(*params); // Set the data of constant tensors
......
...@@ -35,12 +35,12 @@ void TestCase::CreateInstruction() { ...@@ -35,12 +35,12 @@ void TestCase::CreateInstruction() {
op_desc_.reset(new cpp::OpDesc()); op_desc_.reset(new cpp::OpDesc());
op_desc_->SetType("subgraph"); op_desc_->SetType("subgraph");
op_desc_->SetAttr<int32_t>("sub_block", sub_block_idx); op_desc_->SetAttr<int32_t>("sub_block", sub_block_idx);
op_desc_->SetInput("Inputs", op_desc_->input_vars()); auto in_names = sub_block_op_desc->input_vars();
op_desc_->SetOutput("Outputs", op_desc_->output_vars()); auto out_names = sub_block_op_desc->output_vars();
op_desc_->SetAttr<std::vector<std::string>>( op_desc_->SetInput("Inputs", in_names);
"input_data_names", sub_block_op_desc->input_vars()); op_desc_->SetOutput("Outputs", out_names);
op_desc_->SetAttr<std::vector<std::string>>( op_desc_->SetAttr<std::vector<std::string>>("input_data_names", in_names);
"output_data_names", sub_block_op_desc->output_vars()); op_desc_->SetAttr<std::vector<std::string>>("output_data_names", out_names);
op = LiteOpRegistry::Global().Create(op_desc().Type()); op = LiteOpRegistry::Global().Create(op_desc().Type());
static_cast<operators::SubgraphOp*>(op.get())->SetSubBlock(sub_block_desc); static_cast<operators::SubgraphOp*>(op.get())->SetSubBlock(sub_block_desc);
} else { } else {
......
...@@ -188,13 +188,17 @@ class Arena { ...@@ -188,13 +188,17 @@ class Arena {
tester_->Prepare(); tester_->Prepare();
} }
bool TestPrecision() { bool TestPrecision(const std::vector<std::string>& exclude_outs = {}) {
tester_->RunBaseline(tester_->baseline_scope()); tester_->RunBaseline(tester_->baseline_scope());
tester_->RunInstruction(); tester_->RunInstruction();
bool success = true; bool success = true;
for (auto& out : tester_->op_desc().OutputArgumentNames()) { for (auto& out : tester_->op_desc().OutputArgumentNames()) {
for (auto& var : tester_->op_desc().Output(out)) { for (auto& var : tester_->op_desc().Output(out)) {
if (std::find(exclude_outs.begin(), exclude_outs.end(), var) !=
exclude_outs.end()) {
continue;
}
success = success && CompareTensor(out, var); success = success && CompareTensor(out, var);
} }
} }
...@@ -209,7 +213,17 @@ class Arena { ...@@ -209,7 +213,17 @@ class Arena {
} }
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>( auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::high_resolution_clock::now() - timer); std::chrono::high_resolution_clock::now() - timer);
LOG(INFO) << "average duration: " << duration.count() << " ms";
timer = std::chrono::high_resolution_clock::now();
for (int i = 0; i < times; i++) {
tester_->RunBaseline(tester_->baseline_scope());
}
auto duration_basic = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::high_resolution_clock::now() - timer);
LOG(INFO) << "average lite duration: " << duration.count() << " ms";
LOG(INFO) << "average basic duration: " << duration_basic.count() << " ms";
LOG(INFO) << "speed up ratio: lite_speed / basic_speed: "
<< static_cast<float>(duration_basic.count()) / duration.count();
} }
private: private:
......
...@@ -16,9 +16,11 @@ lite_cc_library(mir_passes ...@@ -16,9 +16,11 @@ lite_cc_library(mir_passes
fusion/interpolate_fuse_pass.cc fusion/interpolate_fuse_pass.cc
fusion/conv_elementwise_fuse_pass.cc fusion/conv_elementwise_fuse_pass.cc
fusion/conv_activation_fuse_pass.cc fusion/conv_activation_fuse_pass.cc
fusion/var_conv_2d_activation_fuse_pass.cc
fusion/conv_bn_fuse_pass.cc fusion/conv_bn_fuse_pass.cc
fusion/elementwise_add_activation_fuse_pass.cc fusion/elementwise_add_activation_fuse_pass.cc
fusion/quant_dequant_fuse_pass.cc fusion/quant_dequant_fuse_pass.cc
fusion/sequence_pool_concat_fuse_pass.cc
elimination/identity_scale_eliminate_pass.cc elimination/identity_scale_eliminate_pass.cc
elimination/elementwise_mul_constant_eliminate_pass.cc elimination/elementwise_mul_constant_eliminate_pass.cc
static_kernel_pick_pass.cc static_kernel_pick_pass.cc
......
...@@ -10,6 +10,9 @@ lite_cc_library(fuse_conv_elementwise ...@@ -10,6 +10,9 @@ lite_cc_library(fuse_conv_elementwise
lite_cc_library(fuse_conv_activation lite_cc_library(fuse_conv_activation
SRCS conv_activation_fuser.cc SRCS conv_activation_fuser.cc
DEPS pattern_matcher_high_api) DEPS pattern_matcher_high_api)
lite_cc_library(fuse_var_conv_activation
SRCS var_conv_2d_activation_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_conv_bn lite_cc_library(fuse_conv_bn
SRCS conv_bn_fuser.cc SRCS conv_bn_fuser.cc
DEPS pattern_matcher_high_api) DEPS pattern_matcher_high_api)
...@@ -25,17 +28,22 @@ lite_cc_library(fuse_transpose_softmax_transpose ...@@ -25,17 +28,22 @@ lite_cc_library(fuse_transpose_softmax_transpose
lite_cc_library(fuse_interpolate lite_cc_library(fuse_interpolate
SRCS interpolate_fuser.cc SRCS interpolate_fuser.cc
DEPS pattern_matcher_high_api) DEPS pattern_matcher_high_api)
lite_cc_library(fuse_sequence_pool_concat
SRCS sequence_pool_concat_fuser.cc
DEPS pattern_matcher_high_api)
set(mir_fusers set(mir_fusers
fuse_fc fuse_fc
fuse_shuffle_channel fuse_shuffle_channel
fuse_conv_elementwise fuse_conv_elementwise
fuse_conv_activation fuse_conv_activation
fuse_var_conv_activation
fuse_conv_bn fuse_conv_bn
fuse_quant_dequant fuse_quant_dequant
fuse_elementwise_add_activation fuse_elementwise_add_activation
fuse_transpose_softmax_transpose fuse_transpose_softmax_transpose
fuse_interpolate fuse_interpolate
fuse_sequence_pool_concat
CACHE INTERNAL "fusers") CACHE INTERNAL "fusers")
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/sequence_pool_concat_fuse_pass.h"
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/sequence_pool_concat_fuser.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void SequencePoolConcatFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fusion::SequencePoolConcatFuser fuser;
fuser(graph.get());
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(lite_sequence_pool_concat_fuse_pass,
paddle::lite::mir::SequencePoolConcatFusePass)
.BindTargets({TARGET(kCUDA)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
class SequencePoolConcatFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/sequence_pool_concat_fuser.h"
#include <memory>
#include <vector>
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
// """
// merge {sequence_pool x 7, concat} => merge_sequence_pool_and_concat
// src1 src2 src7 src1 src2 src7
// | | | | |
// v v | | ... |
// sequence_pool sequence_pool ...(sequence_pool) | | |
// | | | => -------------------
// --------------------------------- |
// | |
// v v
// concat sequence_pool_concat
// """
void SequencePoolConcatFuser::BuildPattern() {
// create nodes.
auto* concat = OpNode("concat", "concat")->AsIntermediate();
#define STR1(R) #R
#define STR2(R) STR1(R)
#define POOL_CONCAT_PATTERN(num) \
auto* x_##num = VarNode(STR2(sequence_pool_x_##num)) \
->assert_is_op_input("sequence_pool", "X") \
->AsInput(); \
auto* sequence_pool_##num = \
OpNode(STR2(sequence_pool_##num), "sequence_pool")->AsIntermediate(); \
auto* sequence_pool_##num##_out = \
VarNode(STR2(sequence_pool_##num##_out)) \
->assert_is_op_output("sequence_pool", "Out") \
->assert_is_op_nth_input("concat", "X", num - 1) \
->AsIntermediate(); \
auto* sequence_pool_##num##_idx = \
VarNode(STR2(sequence_pool_##num##_idx)) \
->assert_is_op_output("sequence_pool", "MaxIndex") \
->AsIntermediate(); \
*sequence_pool_##num >> *sequence_pool_##num##_idx; \
*x_##num >> *sequence_pool_##num >> *sequence_pool_##num##_out >> *concat;
auto* concat_out =
VarNode("concat_out")->assert_is_op_output("concat", "Out");
*concat >> *concat_out;
POOL_CONCAT_PATTERN(1);
POOL_CONCAT_PATTERN(2);
POOL_CONCAT_PATTERN(3);
POOL_CONCAT_PATTERN(4);
POOL_CONCAT_PATTERN(5);
POOL_CONCAT_PATTERN(6);
POOL_CONCAT_PATTERN(7);
#undef POOL_CONCAT_PATTERN
#undef STR1
#undef STR2
}
void SequencePoolConcatFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto op_desc = GenOpDesc(matched);
auto sequence_pool_concat_op =
LiteOpRegistry::Global().Create("sequence_pool_concat");
auto concat = matched.at("concat")->stmt()->op();
auto* scope = concat->scope();
auto& valid_places = concat->valid_places();
sequence_pool_concat_op->Attach(op_desc, scope);
auto* new_op_node =
graph->GraphCreateInstructNode(sequence_pool_concat_op, valid_places);
IR_NODE_LINK_TO(matched.at("sequence_pool_x_1"), new_op_node);
IR_NODE_LINK_TO(matched.at("sequence_pool_x_2"), new_op_node);
IR_NODE_LINK_TO(matched.at("sequence_pool_x_3"), new_op_node);
IR_NODE_LINK_TO(matched.at("sequence_pool_x_4"), new_op_node);
IR_NODE_LINK_TO(matched.at("sequence_pool_x_5"), new_op_node);
IR_NODE_LINK_TO(matched.at("sequence_pool_x_6"), new_op_node);
IR_NODE_LINK_TO(matched.at("sequence_pool_x_7"), new_op_node);
IR_NODE_LINK_TO(new_op_node, matched.at("concat_out"));
}
cpp::OpDesc SequencePoolConcatFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc op_desc = *matched.at("concat")->stmt()->op_info();
op_desc.SetType("sequence_pool_concat");
op_desc.SetInput("X",
{matched.at("sequence_pool_x_1")->arg()->name,
matched.at("sequence_pool_x_2")->arg()->name,
matched.at("sequence_pool_x_3")->arg()->name,
matched.at("sequence_pool_x_4")->arg()->name,
matched.at("sequence_pool_x_5")->arg()->name,
matched.at("sequence_pool_x_6")->arg()->name,
matched.at("sequence_pool_x_7")->arg()->name});
std::vector<std::string> pooltypes;
pooltypes.push_back(matched.at("sequence_pool_1")
->stmt()
->op_info()
->GetAttr<std::string>("pooltype"));
pooltypes.push_back(matched.at("sequence_pool_2")
->stmt()
->op_info()
->GetAttr<std::string>("pooltype"));
pooltypes.push_back(matched.at("sequence_pool_3")
->stmt()
->op_info()
->GetAttr<std::string>("pooltype"));
pooltypes.push_back(matched.at("sequence_pool_4")
->stmt()
->op_info()
->GetAttr<std::string>("pooltype"));
pooltypes.push_back(matched.at("sequence_pool_5")
->stmt()
->op_info()
->GetAttr<std::string>("pooltype"));
pooltypes.push_back(matched.at("sequence_pool_6")
->stmt()
->op_info()
->GetAttr<std::string>("pooltype"));
pooltypes.push_back(matched.at("sequence_pool_7")
->stmt()
->op_info()
->GetAttr<std::string>("pooltype"));
op_desc.SetAttr("pooltype", pooltypes);
op_desc.SetOutput("Out", {matched.at("concat_out")->arg()->name});
return op_desc;
}
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class SequencePoolConcatFuser : public FuseBase {
public:
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
};
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.h"
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/var_conv_2d_activation_fuser.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void VarConv2dActivationFusePass::Apply(
const std::unique_ptr<SSAGraph>& graph) {
std::vector<std::string> act_types{"relu"};
for (auto act_type : act_types) {
fusion::VarConvActivationFuser fuser(act_type, "var_conv_2d");
fuser(graph.get());
}
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(lite_var_conv_2d_activation_fuse_pass,
paddle::lite::mir::VarConv2dActivationFusePass)
.BindTargets({TARGET(kCUDA)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
class VarConv2dActivationFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/var_conv_2d_activation_fuser.h"
#include <memory>
#include <vector>
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
void VarConvActivationFuser::BuildPattern() {
// create nodes.
auto* input = VarNode("X")->assert_is_op_input(conv_type_, "X")->AsInput();
auto* filter = VarNode("W")->assert_is_op_input(conv_type_, "W")->AsInput();
auto* conv2d = OpNode("var_conv_2d", conv_type_)->AsIntermediate();
auto* act = OpNode("act", act_type_)->AsIntermediate();
auto* conv2d_out = VarNode("conv2d_out")
->assert_is_op_output(conv_type_, "Out")
->assert_is_op_input(act_type_, "X")
->AsIntermediate();
auto* conv2d_out_1 = VarNode("conv2d_out_1")
->assert_is_op_output(conv_type_, "Col")
->AsIntermediate();
auto* out =
VarNode("output")->assert_is_op_output(act_type_, "Out")->AsOutput();
// create topology.
std::vector<PMNode*> conv2d_inputs{filter, input};
conv2d_inputs >> *conv2d >> *conv2d_out >> *act >> *out;
*conv2d >> *conv2d_out_1;
}
void VarConvActivationFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto op_desc = GenOpDesc(matched);
auto conv_op = LiteOpRegistry::Global().Create(conv_type_);
auto conv_old = matched.at("var_conv_2d")->stmt()->op();
auto* scope = conv_old->scope();
auto& valid_places = conv_old->valid_places();
conv_op->Attach(op_desc, scope);
auto* new_op_node = graph->GraphCreateInstructNode(conv_op, valid_places);
IR_NODE_LINK_TO(matched.at("X"), new_op_node);
IR_NODE_LINK_TO(matched.at("W"), new_op_node);
IR_NODE_LINK_TO(new_op_node, matched.at("output"));
}
cpp::OpDesc VarConvActivationFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc op_desc = *matched.at("var_conv_2d")->stmt()->op_info();
op_desc.SetOutput("Out", {matched.at("output")->arg()->name});
cpp::OpDesc act_op_desc = *matched.at("act")->stmt()->op_info();
if (act_type_ == "relu") {
op_desc.SetAttr("fuse_relu", true);
}
return op_desc;
}
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include "lite/core/mir/pattern_matcher_high_api.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class VarConvActivationFuser : public FuseBase {
public:
explicit VarConvActivationFuser(const std::string& act_type,
const std::string& conv_type)
: act_type_(act_type), conv_type_(conv_type) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
std::string act_type_;
std::string conv_type_;
};
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
...@@ -62,12 +62,14 @@ class Optimizer { ...@@ -62,12 +62,14 @@ class Optimizer {
// TODO(Superjomn) Refine the fusion related design to select fusion // TODO(Superjomn) Refine the fusion related design to select fusion
// kernels for devices automatically. // kernels for devices automatically.
"lite_conv_activation_fuse_pass", // "lite_conv_activation_fuse_pass", //
"lite_var_conv_2d_activation_fuse_pass", //
"lite_fc_fuse_pass", // "lite_fc_fuse_pass", //
"lite_shuffle_channel_fuse_pass", // "lite_shuffle_channel_fuse_pass", //
"lite_transpose_softmax_transpose_fuse_pass", // "lite_transpose_softmax_transpose_fuse_pass", //
"lite_interpolate_fuse_pass", // "lite_interpolate_fuse_pass", //
"identity_scale_eliminate_pass", // "identity_scale_eliminate_pass", //
"elementwise_mul_constant_eliminate_pass", // "elementwise_mul_constant_eliminate_pass", //
"lite_sequence_pool_concat_fuse_pass", //
#if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \ #if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) || \
(defined LITE_WITH_ARM) (defined LITE_WITH_ARM)
"lite_elementwise_add_activation_fuse_pass", // "lite_elementwise_add_activation_fuse_pass", //
......
...@@ -262,14 +262,10 @@ void Instruction::Run() { ...@@ -262,14 +262,10 @@ void Instruction::Run() {
if (op_->run_once() && has_run_) { if (op_->run_once() && has_run_) {
return; return;
} }
#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << "kernel launch";
VLOG(4) << "kernel launch";
#endif
op_->InferShape(); op_->InferShape();
#ifndef LITE_SHUTDOWN_LOG // VLOG(4) << ">> Running kernel: " << op_->op_info()->Repr() << " on Target "
VLOG(4) << ">> Running kernel: " << op_->op_info()->Repr() << " on Target " // << TargetToStr(kernel_->target());
<< TargetToStr(kernel_->target());
#endif
kernel_->Launch(); kernel_->Launch();
has_run_ = true; has_run_ = true;
} }
......
...@@ -49,6 +49,7 @@ add_kernel(range_compute_arm ARM basic SRCS range_compute.cc DEPS ${lite_kernel_ ...@@ -49,6 +49,7 @@ add_kernel(range_compute_arm ARM basic SRCS range_compute.cc DEPS ${lite_kernel_
add_kernel(dropout_compute_arm ARM basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(dropout_compute_arm ARM basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(layout_compute_arm ARM basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(layout_compute_arm ARM basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(instance_norm_compute_arm ARM basic SRCS instance_norm_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(instance_norm_compute_arm ARM basic SRCS instance_norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(grid_sampler_compute_arm ARM basic SRCS grid_sampler_compute.cc DEPS ${lite_kernel_deps} math_arm)
## 2.other basic kernels: basic kernels that not used in basic models ## 2.other basic kernels: basic kernels that not used in basic models
add_kernel(negative_compute_arm ARM extra SRCS negative_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(negative_compute_arm ARM extra SRCS negative_compute.cc DEPS ${lite_kernel_deps} math_arm)
......
...@@ -65,20 +65,20 @@ void ConvCompute<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() { ...@@ -65,20 +65,20 @@ void ConvCompute<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
no_dilation && flag_dw) { no_dilation && flag_dw) {
/// dw conv impl /// dw conv impl
impl_ = new DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>; impl_ = new DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>;
VLOG(3) << "invoking dw conv"; // VLOG(3) << "invoking dw conv";
} else if (param.groups == 1 && kw == 3 && stride == 1 && kps_equal && } else if (param.groups == 1 && kw == 3 && stride == 1 && kps_equal &&
no_dilation && pads_all_equal) { no_dilation && pads_all_equal) {
/// winograd conv impl /// winograd conv impl
impl_ = new WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>; impl_ = new WinogradConv<PRECISION(kFloat), PRECISION(kFloat)>;
VLOG(3) << "invoking winograd conv"; // VLOG(3) << "invoking winograd conv";
} else if (param.groups == 1 && kw == 3 && stride == 2 && } else if (param.groups == 1 && kw == 3 && stride == 2 &&
chin * chout < 4 * hin * win && kps_equal && no_dilation) { chin * chout < 4 * hin * win && kps_equal && no_dilation) {
/// direct conv impl /// direct conv impl
impl_ = new DirectConv<PRECISION(kFloat), PRECISION(kFloat)>; impl_ = new DirectConv<PRECISION(kFloat), PRECISION(kFloat)>;
VLOG(3) << "invoking direct conv"; // VLOG(3) << "invoking direct conv";
} else { } else {
impl_ = new GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>; impl_ = new GemmLikeConv<PRECISION(kFloat), PRECISION(kFloat)>;
VLOG(3) << "invoking gemm like conv"; // VLOG(3) << "invoking gemm like conv";
} }
impl_->SetContext(std::move(this->ctx_)); impl_->SetContext(std::move(this->ctx_));
impl_->SetParam(param); impl_->SetParam(param);
...@@ -117,14 +117,14 @@ void ConvCompute<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() { ...@@ -117,14 +117,14 @@ void ConvCompute<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() {
if (param.groups == ic && ic == oc && kps_equal && pads_equal && if (param.groups == ic && ic == oc && kps_equal && pads_equal &&
no_dilation && flag_dw) { no_dilation && flag_dw) {
impl_ = new DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>; impl_ = new DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>;
VLOG(3) << "Run DepthwiseConv Int8"; // VLOG(3) << "Run DepthwiseConv Int8";
} else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) && } else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) &&
kps_equal && no_dilation) { kps_equal && no_dilation) {
impl_ = new DirectConv<PRECISION(kInt8), PRECISION(kFloat)>; impl_ = new DirectConv<PRECISION(kInt8), PRECISION(kFloat)>;
VLOG(3) << "Run DirectConv Int8"; // VLOG(3) << "Run DirectConv Int8";
} else { } else {
impl_ = new GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>; impl_ = new GemmLikeConv<PRECISION(kInt8), PRECISION(kFloat)>;
VLOG(3) << "Run GemmLikeConvInt8"; // VLOG(3) << "Run GemmLikeConvInt8";
} }
impl_->SetContext(std::move(this->ctx_)); impl_->SetContext(std::move(this->ctx_));
impl_->SetParam(param); impl_->SetParam(param);
...@@ -163,14 +163,14 @@ void ConvCompute<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() { ...@@ -163,14 +163,14 @@ void ConvCompute<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
if (param.groups == ic && ic == oc && kps_equal && pads_equal && if (param.groups == ic && ic == oc && kps_equal && pads_equal &&
no_dilation && flag_dw) { no_dilation && flag_dw) {
impl_ = new DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>; impl_ = new DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>;
VLOG(3) << "Run DepthwiseConv Int8"; // VLOG(3) << "Run DepthwiseConv Int8";
} else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) && } else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) &&
kps_equal && no_dilation) { kps_equal && no_dilation) {
impl_ = new DirectConv<PRECISION(kInt8), PRECISION(kInt8)>; impl_ = new DirectConv<PRECISION(kInt8), PRECISION(kInt8)>;
VLOG(3) << "Run DirectConv Int8"; // VLOG(3) << "Run DirectConv Int8";
} else { } else {
impl_ = new GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>; impl_ = new GemmLikeConv<PRECISION(kInt8), PRECISION(kInt8)>;
VLOG(3) << "Run GemmLikeConvInt8"; // VLOG(3) << "Run GemmLikeConvInt8";
} }
impl_->SetContext(std::move(this->ctx_)); impl_->SetContext(std::move(this->ctx_));
impl_->SetParam(param); impl_->SetParam(param);
......
...@@ -30,7 +30,7 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() { ...@@ -30,7 +30,7 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
auto kw = w_dims[3]; auto kw = w_dims[3];
// select dw conv kernel // select dw conv kernel
if (kw == 3) { if (kw == 3) {
VLOG(5) << "invoke 3x3 dw conv fp32"; // VLOG(5) << "invoke 3x3 dw conv fp32";
auto paddings = *param.paddings; auto paddings = *param.paddings;
bool pads_equal = bool pads_equal =
((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3]));
...@@ -54,7 +54,7 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() { ...@@ -54,7 +54,7 @@ void DepthwiseConv<PRECISION(kFloat), PRECISION(kFloat)>::PrepareForRun() {
flag_trans_weights_ = true; flag_trans_weights_ = true;
} }
} else if (kw == 5) { } else if (kw == 5) {
VLOG(5) << "invoke 5x5 dw conv fp32"; // VLOG(5) << "invoke 5x5 dw conv fp32";
impl_ = lite::arm::math::conv_depthwise_5x5_fp32; impl_ = lite::arm::math::conv_depthwise_5x5_fp32;
} else { } else {
LOG(FATAL) << "this type dw conv not impl"; LOG(FATAL) << "this type dw conv not impl";
...@@ -86,7 +86,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() { ...@@ -86,7 +86,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() {
/// select dw conv kernel /// select dw conv kernel
if (kw == 3) { if (kw == 3) {
// trans weights // trans weights
VLOG(5) << "invoke 3x3 dw conv int8 kernel fp32 out"; // VLOG(5) << "invoke 3x3 dw conv int8 kernel fp32 out";
impl_ = lite::arm::math::conv_depthwise_3x3_int8_fp32; impl_ = lite::arm::math::conv_depthwise_3x3_int8_fp32;
int cround = ROUNDUP(w_dims[0], 8); int cround = ROUNDUP(w_dims[0], 8);
weights_.Resize({cround / 8, 1, kh * kw, 8}); weights_.Resize({cround / 8, 1, kh * kw, 8});
...@@ -96,7 +96,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() { ...@@ -96,7 +96,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kFloat)>::PrepareForRun() {
flag_trans_weights_ = true; flag_trans_weights_ = true;
} else if (kw == 5) { } else if (kw == 5) {
// trans weights // trans weights
VLOG(5) << "invoke 5x5 dw conv int8 kernel fp32 out"; // VLOG(5) << "invoke 5x5 dw conv int8 kernel fp32 out";
impl_ = lite::arm::math::conv_depthwise_5x5_int8_fp32; impl_ = lite::arm::math::conv_depthwise_5x5_int8_fp32;
int cround = ROUNDUP(w_dims[0], 8); int cround = ROUNDUP(w_dims[0], 8);
weights_.Resize({cround / 8, 1, kh * kw, 8}); weights_.Resize({cround / 8, 1, kh * kw, 8});
...@@ -145,7 +145,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() { ...@@ -145,7 +145,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
/// select dw conv kernel /// select dw conv kernel
if (kw == 3) { if (kw == 3) {
// trans weights // trans weights
VLOG(5) << "invoke 3x3 dw conv int8 kernel int8 out"; // VLOG(5) << "invoke 3x3 dw conv int8 kernel int8 out";
impl_ = lite::arm::math::conv_depthwise_3x3_int8_int8; impl_ = lite::arm::math::conv_depthwise_3x3_int8_int8;
int cround = ROUNDUP(w_dims[0], 8); int cround = ROUNDUP(w_dims[0], 8);
weights_.Resize({cround / 8, 1, kh * kw, 8}); weights_.Resize({cround / 8, 1, kh * kw, 8});
...@@ -155,7 +155,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() { ...@@ -155,7 +155,7 @@ void DepthwiseConv<PRECISION(kInt8), PRECISION(kInt8)>::PrepareForRun() {
flag_trans_weights_ = true; flag_trans_weights_ = true;
} else if (kw == 5) { } else if (kw == 5) {
// trans weights // trans weights
VLOG(5) << "invoke 5x5 dw conv int8 kernel int8 out"; // VLOG(5) << "invoke 5x5 dw conv int8 kernel int8 out";
impl_ = lite::arm::math::conv_depthwise_5x5_int8_int8; impl_ = lite::arm::math::conv_depthwise_5x5_int8_int8;
int cround = ROUNDUP(w_dims[0], 8); int cround = ROUNDUP(w_dims[0], 8);
weights_.Resize({cround / 8, 1, kh * kw, 8}); weights_.Resize({cround / 8, 1, kh * kw, 8});
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/grid_sampler_compute.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
void GridSamplerCompute::PrepareForRun() {}
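// Bilinear grid sampler: each grid entry holds a normalized (x, y)
// coordinate in [-1, 1]. Stage one precomputes, for every sample point,
// the four neighbour indices (xw, xe, yn, ys), the interpolation
// distances and in-bounds masks; stage two blends the four neighbours
// for every channel.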
void GridSamplerCompute::Run() {
auto& param = this->Param<param_t>();
auto n = param.x->dims()[0];
auto c = param.x->dims()[1];
auto h = param.x->dims()[2];
auto w = param.x->dims()[3];
const float* in = param.x->data<float>();
const float* grid = param.grid->data<float>();
float* out = param.out->mutable_data<float>();
auto& ctx = this->ctx_->template As<ARMContext>();
const size_t coor_size = n * h * w;
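// workspace layout per sample point: 4 int32 corner indices, 4 float
// distances and 4 uint32 bound masks -> 12 four-byte values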
const size_t workspace_size = coor_size * 12 * sizeof(float);
ctx.ExtendWorkspace(workspace_size);
int32_t* coor_p = ctx.workspace_data<int>();
float* dis_p = reinterpret_cast<float*>(coor_p) + coor_size * 4;
uint32_t* bound_p = reinterpret_cast<uint32_t*>(dis_p) + coor_size * 4;
float x_max = static_cast<float>(w - 1);
float y_max = static_cast<float>(h - 1);
float32x4_t vxmax = vdupq_n_f32(x_max);
float32x4_t vymax = vdupq_n_f32(y_max);
float32x4_t vone = vdupq_n_f32(1.f);
float32x4_t vzero = vdupq_n_f32(0.f);
// compute coor, dis, bound
int i = coor_size;
for (; i > 3; i -= 4) {
float32x4x2_t xy = vld2q_f32(grid);
float32x4_t grid_x = vmulq_n_f32(vaddq_f32(xy.val[0], vone), 0.5 * x_max);
float32x4_t grid_y = vmulq_n_f32(vaddq_f32(xy.val[1], vone), 0.5 * y_max);
grid += 8;
// compute xw, xe, yn, ys
int32x4x4_t vcoor;
vcoor.val[0] = vcvtq_s32_f32(grid_x);
vcoor.val[2] = vcvtq_s32_f32(grid_y);
float32x4_t vxwf = vcvtq_f32_s32(vcoor.val[0]);
float32x4_t vynf = vcvtq_f32_s32(vcoor.val[2]);
float32x4_t vxef = vaddq_f32(vxwf, vone);
float32x4_t vysf = vaddq_f32(vynf, vone);
vcoor.val[1] = vcvtq_s32_f32(vxef);
vcoor.val[3] = vcvtq_s32_f32(vysf);
vst4q_s32(coor_p, vcoor);
coor_p += 16;
// compute dw, dn, de, ds
float32x4x4_t vdis;
vdis.val[0] = vsubq_f32(grid_x, vxwf);
vdis.val[2] = vsubq_f32(grid_y, vynf);
vdis.val[1] = vsubq_f32(vxef, grid_x);
vdis.val[3] = vsubq_f32(vysf, grid_y);
vst4q_f32(dis_p, vdis);
dis_p += 16;
// compute bound
uint32x4x4_t vbound;
uint32x4_t logic_xw =
vorrq_u32(vcltq_f32(vxwf, vzero), vcgtq_f32(vxwf, vxmax));
uint32x4_t logic_xe =
vorrq_u32(vcltq_f32(vxef, vzero), vcgtq_f32(vxef, vxmax));
uint32x4_t logic_yn =
vorrq_u32(vcltq_f32(vynf, vzero), vcgtq_f32(vynf, vymax));
uint32x4_t logic_ys =
vorrq_u32(vcltq_f32(vysf, vzero), vcgtq_f32(vysf, vymax));
vbound.val[0] = vmvnq_u32(vorrq_u32(logic_xw, logic_yn));
vbound.val[1] = vmvnq_u32(vorrq_u32(logic_xe, logic_yn));
vbound.val[2] = vmvnq_u32(vorrq_u32(logic_xw, logic_ys));
vbound.val[3] = vmvnq_u32(vorrq_u32(logic_xe, logic_ys));
vst4q_u32(bound_p, vbound);
bound_p += 16;
}
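// scalar tail for the remaining (< 4) sample points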
for (; i > 0; i--) {
float x = grid[0];
float y = grid[1];
float grid_x = (x + 1) * 0.5 * x_max;
float grid_y = (y + 1) * 0.5 * y_max;
grid += 2;
// compute xw, xe, yn, ys
int32_t xw = static_cast<int32_t>(floor(grid_x));
int32_t xe = xw + 1;
int32_t yn = static_cast<int32_t>(floor(grid_y));
int32_t ys = yn + 1;
*coor_p++ = xw;
*coor_p++ = xe;
*coor_p++ = yn;
*coor_p++ = ys;
// compute dw, de, dn, ds
float dw = grid_x - xw;
float de = xe - grid_x;
float dn = grid_y - yn;
float ds = ys - grid_y;
*dis_p++ = dw;
*dis_p++ = de;
*dis_p++ = dn;
*dis_p++ = ds;
// compute bound
bool logic_xw = (xw < 0.f || xw > x_max);
bool logic_xe = (xe < 0.f || xe > x_max);
bool logic_yn = (yn < 0.f || yn > y_max);
bool logic_ys = (ys < 0.f || ys > y_max);
*bound_p++ = ((logic_xw || logic_yn) ? 0 : 0xffffffff);
*bound_p++ = ((logic_xe || logic_yn) ? 0 : 0xffffffff);
*bound_p++ = ((logic_xw || logic_ys) ? 0 : 0xffffffff);
*bound_p++ = ((logic_xe || logic_ys) ? 0 : 0xffffffff);
}
size_t cube_size = c * h * w;
size_t spatial_size = h * w;
// compute output
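// out = ds * (in_wn * de + in_en * dw) + dn * (in_ws * de + in_es * dw),
// with out-of-bounds neighbours zeroed by the precomputed masks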
for (int i = 0; i < n; ++i) {
const float* in_n = in + i * cube_size;
float* out_n = out + i * cube_size;
int32_t* coor_n = ctx.workspace_data<int>() + i * spatial_size * 4;
float* dis_n = reinterpret_cast<float*>(coor_n) + coor_size * 4;
uint32_t* bound_n = reinterpret_cast<uint32_t*>(dis_n) + coor_size * 4;
#pragma omp parallel for
for (int j = 0; j < c; ++j) {
int32_t* coor_ptr = coor_n;
float* dis_ptr = dis_n;
uint32_t* bound_ptr = bound_n;
const float* in_c = in_n + j * spatial_size;
float* out_c = out_n + j * spatial_size;
for (int k = 0; k < spatial_size; k++) {
int32x4_t vcoor = vld1q_s32(coor_ptr);
float32x4_t vdis = vld1q_f32(dis_ptr);
int32_t xw = vgetq_lane_s32(vcoor, 0);
int32_t xe = vgetq_lane_s32(vcoor, 1);
int32_t yn = vgetq_lane_s32(vcoor, 2);
int32_t ys = vgetq_lane_s32(vcoor, 3);
uint32x4_t vbound = vld1q_u32(bound_ptr);
float dw = vgetq_lane_f32(vdis, 0);
float de = vgetq_lane_f32(vdis, 1);
float dn = vgetq_lane_f32(vdis, 2);
float ds = vgetq_lane_f32(vdis, 3);
uint32_t wnbound = vgetq_lane_u32(vbound, 0);
uint32_t enbound = vgetq_lane_u32(vbound, 1);
uint32_t wsbound = vgetq_lane_u32(vbound, 2);
uint32_t esbound = vgetq_lane_u32(vbound, 3);
float in_wn = wnbound ? in_c[yn * w + xw] : 0.f;
float in_en = enbound ? in_c[yn * w + xe] : 0.f;
float in_ws = wsbound ? in_c[ys * w + xw] : 0.f;
float in_es = esbound ? in_c[ys * w + xe] : 0.f;
coor_ptr += 4;
dis_ptr += 4;
bound_ptr += 4;
*out_c++ =
ds * (in_wn * de + in_en * dw) + dn * (in_ws * de + in_es * dw);
}
}
}
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(grid_sampler,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::GridSamplerCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Grid", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Output", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class GridSamplerCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::GridSamplerParam;
void PrepareForRun() override;
void Run() override;
virtual ~GridSamplerCompute() = default;
private:
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
...@@ -41,18 +41,20 @@ void PoolCompute::Run() { ...@@ -41,18 +41,20 @@ void PoolCompute::Run() {
std::vector<int>& paddings = *param.paddings; std::vector<int>& paddings = *param.paddings;
std::string& pooling_type = param.pooling_type; std::string& pooling_type = param.pooling_type;
bool global_pooling = param.global_pooling;
bool exclusive = param.exclusive; bool exclusive = param.exclusive;
bool adaptive = param.adaptive; bool adaptive = param.adaptive;
bool ceil_mode = param.ceil_mode; bool ceil_mode = param.ceil_mode;
bool use_quantizer = param.use_quantizer; bool use_quantizer = param.use_quantizer;
std::string& data_format = param.data_format; std::string& data_format = param.data_format;
bool pads_equal = bool pads_equal = (paddings[0] == paddings[1]) &&
(paddings[0] == paddings[1]) && (paddings[2] == paddings[3]); (paddings[2] == paddings[3]) &&
(paddings[0] == paddings[2]);
bool kps_equal = (ksize[0] == ksize[1]) && (strides[0] == strides[1]) && bool kps_equal =
(paddings[0] == paddings[2]); (ksize[0] == ksize[1]) && (strides[0] == strides[1]) && pads_equal;
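// treat a kernel that spans the whole feature map with zero padding as
// global pooling, even when the global_pooling flag is not set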
bool global_pooling = (paddings[0] == 0) && (ksize[0] == in_dims[2]) &&
(ksize[1] == in_dims[3]) && pads_equal;
global_pooling = param.global_pooling || global_pooling;
if (global_pooling) { if (global_pooling) {
for (size_t i = 0; i < ksize.size(); ++i) { for (size_t i = 0; i < ksize.size(); ++i) {
paddings[2 * i] = 0; paddings[2 * i] = 0;
...@@ -83,8 +85,7 @@ void PoolCompute::Run() { ...@@ -83,8 +85,7 @@ void PoolCompute::Run() {
return; return;
} }
} else { } else {
if (ksize[0] == 2 && strides[0] == 2 && paddings[0] == 0 && pads_equal && if (ksize[0] == 2 && strides[0] == 2 && paddings[0] == 0 && kps_equal) {
kps_equal) {
if (pooling_type == "max") { if (pooling_type == "max") {
lite::arm::math::pooling2x2s2_max(din, lite::arm::math::pooling2x2s2_max(din,
dout, dout,
...@@ -110,7 +111,7 @@ void PoolCompute::Run() { ...@@ -110,7 +111,7 @@ void PoolCompute::Run() {
return; return;
} }
} else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 1 && } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 1 &&
pads_equal && kps_equal) { kps_equal) {
if (pooling_type == "max") { if (pooling_type == "max") {
lite::arm::math::pooling3x3s1p1_max(din, lite::arm::math::pooling3x3s1p1_max(din,
dout, dout,
...@@ -136,7 +137,7 @@ void PoolCompute::Run() { ...@@ -136,7 +137,7 @@ void PoolCompute::Run() {
return; return;
} }
} else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 0 && } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 0 &&
pads_equal && kps_equal) { kps_equal) {
if (pooling_type == "max") { if (pooling_type == "max") {
lite::arm::math::pooling3x3s1p0_max(din, lite::arm::math::pooling3x3s1p0_max(din,
dout, dout,
...@@ -162,7 +163,7 @@ void PoolCompute::Run() { ...@@ -162,7 +163,7 @@ void PoolCompute::Run() {
return; return;
} }
} else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 0 && } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 0 &&
pads_equal && kps_equal) { kps_equal) {
if (pooling_type == "max") { if (pooling_type == "max") {
lite::arm::math::pooling3x3s2p0_max(din, lite::arm::math::pooling3x3s2p0_max(din,
dout, dout,
...@@ -188,7 +189,7 @@ void PoolCompute::Run() { ...@@ -188,7 +189,7 @@ void PoolCompute::Run() {
return; return;
} }
} else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 1 && } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 1 &&
pads_equal && kps_equal) { kps_equal) {
if (pooling_type == "max") { if (pooling_type == "max") {
lite::arm::math::pooling3x3s2p1_max(din, lite::arm::math::pooling3x3s2p1_max(din,
dout, dout,
......
...@@ -54,7 +54,7 @@ void SplitLodTensorCompute::Run() { ...@@ -54,7 +54,7 @@ void SplitLodTensorCompute::Run() {
} }
lod->clear(); lod->clear();
for (size_t i = 0; i < static_cast<size_t>(mask_dim[0]); i++) { for (size_t i = 0; i < static_cast<size_t>(mask_dim[0]); i++) {
VLOG(4) << "mask: " << mask_data[i]; // VLOG(4) << "mask: " << mask_data[i];
if (static_cast<size_t>(mask_data[i]) == t) { if (static_cast<size_t>(mask_data[i]) == t) {
size_t start_idx = i; size_t start_idx = i;
auto lod_and_offset = lite::arm::math::GetSubLoDAndAbsoluteOffset( auto lod_and_offset = lite::arm::math::GetSubLoDAndAbsoluteOffset(
......
...@@ -36,7 +36,7 @@ class StepExecutor { ...@@ -36,7 +36,7 @@ class StepExecutor {
auto &op_desc = *block->template GetOp<cpp::OpDesc>(i); auto &op_desc = *block->template GetOp<cpp::OpDesc>(i);
auto op_type = op_desc.Type(); auto op_type = op_desc.Type();
auto op_handler = lite::LiteOpRegistry::Global().Create(op_desc.Type()); auto op_handler = lite::LiteOpRegistry::Global().Create(op_desc.Type());
VLOG(4) << "while: creating Op [" << op_type << "]"; // VLOG(4) << "while: creating Op [" << op_type << "]";
op_handler->Attach(op_desc, scope); op_handler->Attach(op_desc, scope);
auto hostplace = place_; auto hostplace = place_;
...@@ -51,9 +51,9 @@ class StepExecutor { ...@@ -51,9 +51,9 @@ class StepExecutor {
void Run() { void Run() {
for (auto &op_handler : ops_of_block_) { for (auto &op_handler : ops_of_block_) {
VLOG(4) << op_handler->op_info()->Repr(); // VLOG(4) << op_handler->op_info()->Repr();
op_handler->InferShape(); op_handler->InferShape();
VLOG(4) << "while: infered shape"; // VLOG(4) << "while: infered shape";
op_handler->Run(); op_handler->Run();
} }
} }
......
...@@ -11,6 +11,7 @@ add_kernel(leaky_relu_compute_cuda CUDA basic SRCS leaky_relu_compute.cu DEPS ${ ...@@ -11,6 +11,7 @@ add_kernel(leaky_relu_compute_cuda CUDA basic SRCS leaky_relu_compute.cu DEPS ${
add_kernel(relu_compute_cuda CUDA basic SRCS relu_compute.cu DEPS ${lite_kernel_deps}) add_kernel(relu_compute_cuda CUDA basic SRCS relu_compute.cu DEPS ${lite_kernel_deps})
add_kernel(yolo_box_compute_cuda CUDA basic SRCS yolo_box_compute.cu DEPS ${lite_kernel_deps}) add_kernel(yolo_box_compute_cuda CUDA basic SRCS yolo_box_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_pool_compute_cuda CUDA extra SRCS sequence_pool_compute.cu DEPS ${lite_kernel_deps}) add_kernel(sequence_pool_compute_cuda CUDA extra SRCS sequence_pool_compute.cu DEPS ${lite_kernel_deps})
add_kernel(sequence_pool_concat_compute_cuda CUDA extra SRCS sequence_pool_concat_compute.cu DEPS ${lite_kernel_deps})
add_kernel(transpose_compute_cuda CUDA basic SRCS transpose_compute.cu DEPS ${lite_kernel_deps} ${math_cuda} cuda_transpose) add_kernel(transpose_compute_cuda CUDA basic SRCS transpose_compute.cu DEPS ${lite_kernel_deps} ${math_cuda} cuda_transpose)
add_kernel(nearest_interp_compute_cuda CUDA basic SRCS nearest_interp_compute.cu DEPS ${lite_kernel_deps}) add_kernel(nearest_interp_compute_cuda CUDA basic SRCS nearest_interp_compute.cu DEPS ${lite_kernel_deps})
add_kernel(conv2d_cuda CUDA basic SRCS conv_compute.cc DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(conv2d_cuda CUDA basic SRCS conv_compute.cc DEPS ${lite_kernel_deps} ${math_cuda})
......
...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and ...@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <algorithm>
#include <vector> #include <vector>
#include "lite/core/op_registry.h" #include "lite/core/op_registry.h"
#include "lite/kernels/cuda/match_matrix_tensor_compute.h" #include "lite/kernels/cuda/match_matrix_tensor_compute.h"
...@@ -20,6 +21,54 @@ namespace kernels { ...@@ -20,6 +21,54 @@ namespace kernels {
namespace cuda { namespace cuda {
using Tensor = lite::Tensor; using Tensor = lite::Tensor;
template <typename dtype>
void gpu_transpose(
cublasHandle_t handle, const dtype* src, int M, int N, dtype* dst);
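// The float specialization below relies on cublasSgeam with a transposed
// first operand: it writes the transpose of the row-major M x N matrix
// `src` into `dst` (row-major N x M).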
template <>
void gpu_transpose<float>(
cublasHandle_t handle, const float* src, int M, int N, float* dst) {
float alpha = 1.0;
float beta = 0.0;
CUBLAS_CHECK(cublasSgeam(handle,
CUBLAS_OP_T,
CUBLAS_OP_N,
M,
N,
&alpha,
src,
N,
&beta,
dst,
M,
dst,
M));
}
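// Scatter the GEMM result, packed by the concatenated right sequences,
// into a dense [seq_num, tl, max_len_r] layout; positions beyond each
// sequence's real length are zero-filled.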
template <typename dtype>
__global__ void padding_out(const dtype* src,
const int* offset,
const int seq_num_r,
const int max_len_r,
const int tl,
const int count,
dtype* dst) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int thread_num = blockDim.x * gridDim.x;
for (tid = threadIdx.x + blockIdx.x * blockDim.x; tid < count;
tid += thread_num) {
int seq_id = tid / (tl * max_len_r);
int tl_id = (tid / (max_len_r)) % tl;
int r_id = tid % max_len_r;
int cur_len = offset[seq_id + 1] - offset[seq_id];
if (r_id < cur_len) {
dst[tid] = src[(offset[seq_id] + r_id) * tl + tl_id];
} else {
dst[tid] = 0.f;
}
}
}
void MatchMatrixTensorCompute::PrepareForRun() { void MatchMatrixTensorCompute::PrepareForRun() {
gemm_impl_.reset(new lite::cuda::math::Gemm<float, float>); gemm_impl_.reset(new lite::cuda::math::Gemm<float, float>);
} }
...@@ -28,6 +77,7 @@ void MatchMatrixTensorCompute::Run() { ...@@ -28,6 +77,7 @@ void MatchMatrixTensorCompute::Run() {
CHECK(ctx_) << "running context should be set first"; CHECK(ctx_) << "running context should be set first";
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
auto& context = this->ctx_->template As<CUDAContext>(); auto& context = this->ctx_->template As<CUDAContext>();
auto stream = context.exec_stream();
auto* x = param.x; auto* x = param.x;
auto* w = param.w; auto* w = param.w;
...@@ -39,76 +89,74 @@ void MatchMatrixTensorCompute::Run() { ...@@ -39,76 +89,74 @@ void MatchMatrixTensorCompute::Run() {
const auto& offset_l = x->lod()[0]; const auto& offset_l = x->lod()[0];
const auto& offset_r = y->lod()[0]; const auto& offset_r = y->lod()[0];
std::vector<int> offset_r_int(offset_r.size());
std::vector<size_t> top_offset; std::transform(offset_r.begin(),
int top_size = 0; offset_r.end(),
top_offset.push_back(top_size); offset_r_int.begin(),
for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { [](int64_t x) -> int { return static_cast<int>(x); });
int len_l = offset_l[b + 1] - offset_l[b];
int len_r = offset_r[b + 1] - offset_r[b]; int batch = offset_r.size() - 1;
top_size += dim_t * len_l * len_r; int len_l = offset_l[1] - offset_l[0];
top_offset.push_back(top_size); for (int i = 1; i < offset_l.size() - 1; i++) {
int cur_len = offset_l[i + 1] - offset_l[i];
CHECK_EQ(cur_len, len_l)
<< "each sequence of left matrix is the same length";
} }
int max_len_r = 0;
auto* bottom_l_data = x->data<float>(); for (int i = 0; i < offset_r.size() - 1; ++i) {
auto* bottom_r_data = y->data<float>(); int cur_len = offset_r[i + 1] - offset_r[i];
auto* t_data = w->data<float>(); max_len_r = cur_len > max_len_r ? cur_len : max_len_r;
auto* out_data = out->mutable_data<float>(TARGET(kCUDA));
auto* bottom_l_trans_data = tmp->mutable_data<float>(TARGET(kCUDA));
gemm_impl_->init(
false, false, x->dims()[0], dim_t * dim_in, dim_in, &context);
gemm_impl_->run(
1.0f, 0.0f, bottom_l_data, t_data, bottom_l_trans_data, &context);
for (size_t b = 0; b < x->lod()[0].size() - 1; b++) {
for (int t = 0; t < dim_t; t++) {
int len_l = offset_l[b + 1] - offset_l[b];
int len_r = offset_r[b + 1] - offset_r[b];
auto* top_data = out_data + top_offset[b] + t * len_l * len_r;
const auto* l_t_data =
bottom_l_trans_data + offset_l[b] * dim_t * dim_in + t * dim_in;
const auto* r_data = bottom_r_data + offset_r[b] * dim_in;
gemm_impl_->init(false,
true,
len_l,
len_r,
dim_in,
dim_t * dim_in,
dim_in,
len_r,
&context);
gemm_impl_->run(1.0f, 0.0f, l_t_data, r_data, top_data, &context);
}
} }
int batch_size = x->lod()[0].size() - 1; _input_l_transform.Resize({batch, dim_t, dim_in, len_l});
int lod_lv1_size = batch_size * dim_t; _input_l_transform_reorganize.Resize({batch, dim_t, len_l, dim_in});
int lod_lv2_size = x->lod()[0].back() * dim_t; _output_tmp.Resize({batch, max_len_r, dim_t, len_l});
std::vector<size_t> out_lod0(batch_size + 1, 0); out->Resize({batch, dim_t, len_l, max_len_r});
std::vector<size_t> out_lod1(lod_lv1_size + 1, 0);
std::vector<size_t> out_lod2(lod_lv2_size + 1, 0); _offset_r.Resize({static_cast<int64_t>(offset_r.size())});
for (int i = 0; i < batch_size; i++) { TargetWrapperCuda::MemcpyAsync(_offset_r.mutable_data<int>(TARGET(kCUDA)),
out_lod0[i + 1] = out_lod0[i] + dim_t; &offset_r_int[0],
int len_l = offset_l[i + 1] - offset_l[i]; sizeof(int) * offset_r.size(),
IoDirection::HtoD,
for (int j = 0; j < dim_t; j++) { stream);
out_lod1[i * dim_t + j + 1] = out_lod1[i * dim_t + j] + len_l;
int len_r = offset_r[i + 1] - offset_r[i]; int len_r = offset_r[offset_r.size() - 1];
const float* input_l = x->data<float>();
for (int k = 0; k < len_l; k++) { const float* input_r = y->data<float>();
out_lod2[offset_l[i] * dim_t + j * len_l + k + 1] = const float* weight_data = w->data<float>();
out_lod2[offset_l[i] * dim_t + j * len_l + k] + len_r; float* input_l_transform =
} _input_l_transform.mutable_data<float>(TARGET(kCUDA));
} float* input_l_transform_reorganize =
_input_l_transform_reorganize.mutable_data<float>(TARGET(kCUDA));
float* output_tmp = _output_tmp.mutable_data<float>(TARGET(kCUDA));
float* out_data = out->mutable_data<float>(TARGET(kCUDA));
gemm_impl_->init(true, true, dim_t * dim_in, len_l, dim_in, &context);
gemm_impl_->run(
1.0f, 0.0f, weight_data, input_l, input_l_transform, &context);
for (int i = 0; i < dim_t; ++i) {
int offset = i * dim_in * len_l;
gpu_transpose(gemm_impl_->get_handle(),
input_l_transform + offset,
dim_in,
len_l,
input_l_transform_reorganize + offset);
} }
gemm_impl_->init(false, true, len_r, dim_t * len_l, dim_in, &context);
LoD out_lod; gemm_impl_->run(
out_lod.push_back(top_offset); 1.0f, 0.0f, input_r, input_l_transform_reorganize, output_tmp, &context);
out_lod.push_back(offset_l); int seq_num = offset_r.size() - 1;
out_lod.push_back(offset_r); int count = seq_num * max_len_r * dim_t * len_l;
out->set_lod(out_lod); const int blocks = 512;
const int grids = (count + blocks - 1) / blocks;
padding_out<float><<<grids, blocks, 0, stream>>>(_output_tmp.data<float>(),
_offset_r.data<int>(),
seq_num,
max_len_r,
dim_t * len_l,
count,
out_data);
out->set_lod(y->lod());
} }
} // namespace cuda } // namespace cuda
......
...@@ -34,6 +34,10 @@ class MatchMatrixTensorCompute ...@@ -34,6 +34,10 @@ class MatchMatrixTensorCompute
private: private:
std::unique_ptr<lite::cuda::math::Gemm<float, float>> gemm_impl_; std::unique_ptr<lite::cuda::math::Gemm<float, float>> gemm_impl_;
lite::Tensor _input_l_transform;
lite::Tensor _input_l_transform_reorganize;
lite::Tensor _output_tmp;
lite::Tensor _offset_r;
}; };
} // namespace cuda } // namespace cuda
......
...@@ -16,92 +16,6 @@ namespace paddle { ...@@ -16,92 +16,6 @@ namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace kernels {
namespace cuda { namespace cuda {
template <typename T>
static void anakin_NV_gemv(cublasHandle_t handle,
const bool TransA,
const int M,
const int N,
const T alpha,
const T* A,
const T* x,
const T beta,
T* y);
template <>
void anakin_NV_gemv<float>(cublasHandle_t handle,
const bool TransA,
const int M,
const int N,
const float alpha,
const float* A,
const float* x,
const float beta,
float* y) {
cublasOperation_t cuTransA = (TransA == false) ? CUBLAS_OP_T : CUBLAS_OP_N;
CUBLAS_CHECK(
cublasSgemv(handle, cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1));
}
template <typename T>
static void anakin_NV_gemm(cublasHandle_t handle,
const bool TransA,
const bool TransB,
const int M,
const int N,
const int K,
const T alpha,
const T* A,
const T* B,
const T beta,
T* C);
template <>
void anakin_NV_gemm<float>(cublasHandle_t handle,
const bool TransA,
const bool TransB,
const int M,
const int N,
const int K,
const float alpha,
const float* A,
const float* B,
const float beta,
float* C) {
// Note that cublas follows fortran order.
int lda = (!TransA /* == CblasNoTrans*/) ? K : M;
int ldb = (!TransB /* == CblasNoTrans*/) ? N : K;
cublasOperation_t cuTransA =
(!TransA /* == CblasNoTrans*/) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(!TransB /* == CblasNoTrans*/) ? CUBLAS_OP_N : CUBLAS_OP_T;
CUBLAS_CHECK(cublasSgemm(handle,
cuTransB,
cuTransA,
N,
M,
K,
&alpha,
B,
ldb,
A,
lda,
&beta,
C,
N));
}
template <>
void anakin_NV_gemm<char>(cublasHandle_t handle,
const bool TransA,
const bool TransB,
const int M,
const int N,
const int K,
const char alpha,
const char* A,
const char* B,
const char beta,
char* C) {
LOG(FATAL) << "int8 gemm is not implemented";
}
template <typename T> template <typename T>
static __global__ void add_bias(int n, static __global__ void add_bias(int n,
...@@ -115,6 +29,11 @@ static __global__ void add_bias(int n, ...@@ -115,6 +29,11 @@ static __global__ void add_bias(int n,
} }
} }
template <typename T>
void SearchFcCompute<T>::PrepareForRun() {
gemm_impl_.reset(new lite::cuda::math::Gemm<float, float>);
}
template <typename T> template <typename T>
void SearchFcCompute<T>::Run() { void SearchFcCompute<T>::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
...@@ -132,22 +51,10 @@ void SearchFcCompute<T>::Run() { ...@@ -132,22 +51,10 @@ void SearchFcCompute<T>::Run() {
const T* weight = w_tensor->data<T>(); const T* weight = w_tensor->data<T>();
const Tensor* b_tensor = param.b; const Tensor* b_tensor = param.b;
const T* bias = b_tensor->data<T>(); const T* bias = b_tensor->data<T>();
cublasCreate(&_handle);
if (_M == 1 && _K > 50000) { CHECK(gemm_impl_->init(false, true, _M, _N, _K, &ctx));
anakin_NV_gemv<T>(_handle, false, _N, _K, (T)1, weight, din, (T)0, dout); gemm_impl_->run(1.0f, 0.0f, din, weight, dout, &ctx);
} else {
anakin_NV_gemm<T>(_handle,
false,
!_flag_trans_weights,
_M,
_N,
_K,
(T)1,
din,
weight,
(T)0,
dout);
}
int total_size = _M * _N; int total_size = _M * _N;
add_bias<T><<<CUDA_GET_BLOCKS(total_size), CUDA_NUM_THREADS, 0, stream>>>( add_bias<T><<<CUDA_GET_BLOCKS(total_size), CUDA_NUM_THREADS, 0, stream>>>(
total_size, _N, bias, dout); total_size, _N, bias, dout);
......
...@@ -14,7 +14,9 @@ ...@@ -14,7 +14,9 @@
#pragma once #pragma once
#include <cudnn.h> #include <cudnn.h>
#include <memory>
#include "lite/backends/cuda/cuda_utils.h" #include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/math/gemm.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
namespace paddle { namespace paddle {
...@@ -34,16 +36,15 @@ template <typename T> ...@@ -34,16 +36,15 @@ template <typename T>
class SearchFcCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> { class SearchFcCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
public: public:
using param_t = operators::SearchFcParam; using param_t = operators::SearchFcParam;
void PrepareForRun() override;
void Run() override; void Run() override;
virtual ~SearchFcCompute() = default; virtual ~SearchFcCompute() = default;
private: private:
bool _flag_trans_weights{false}; std::unique_ptr<lite::cuda::math::Gemm<float, float>> gemm_impl_{nullptr};
int _M; int _M;
int _K; int _K;
int _N; int _N;
cublasHandle_t _handle;
bool _is_continue_buf{true};
}; };
} // namespace cuda } // namespace cuda
......
...@@ -22,43 +22,44 @@ namespace lite { ...@@ -22,43 +22,44 @@ namespace lite {
namespace kernels { namespace kernels {
namespace cuda { namespace cuda {
const int CUDA_NUM_THREADS = 512; template <typename dtype>
__global__ void concat_impl_cuda(const int nthreads,
template <typename T> const dtype* in_data,
inline LoD ConcatLoD(const std::vector<lite::Tensor*>& xs) { const int num_concats,
std::vector<size_t> result; const int concat_size,
result.resize(xs[0]->lod()[0].size()); const int top_concat_axis,
const int bottom_concat_axis,
for (size_t i = 1; i < result.size(); ++i) { const int offset_concat_axis,
size_t sum = 0; dtype* out_data) {
for (size_t j = 0; j < xs.size(); ++j) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
auto& x_lod = xs[j]->lod()[0]; index += blockDim.x * gridDim.x) {
sum += x_lod[i]; const int total_concat_size = concat_size * bottom_concat_axis;
} const int concat_num = index / total_concat_size;
result[i] = sum; const int concat_index = index % total_concat_size;
const int top_index =
concat_index +
(concat_num * top_concat_axis + offset_concat_axis) * concat_size;
out_data[top_index] = in_data[index];
} }
LoD lod;
lod.emplace_back(result);
return lod;
} }
template <typename Dtype> template <typename dtype>
__global__ void ker_sequence_concat(Dtype* out_data, __global__ void concat_impl_2d_impl(const int inner_size,
const uint64_t* in_locate_data, const int num_concats,
const int* o2i_map, const dtype* in_data,
const int* o2i_w_map, const int concat_size,
const int seq_num, const int out_concat_axis,
const int emb_size, const int offset_concat_axis,
const int count) { dtype* out_data) {
int idx = blockIdx.x * blockDim.x + threadIdx.x; int idx_inner = threadIdx.x + blockIdx.x * blockDim.x;
for (int tid = idx; tid < count; tid += blockDim.x * gridDim.x) { int idx_outer = threadIdx.y + blockIdx.y * blockDim.y;
int emb_id = tid % emb_size;
int word_id = tid / emb_size; if (idx_inner < inner_size && idx_outer < num_concats) {
int input_id = o2i_map[word_id]; int idx_input = idx_outer * inner_size + idx_inner;
int cur_work_id = o2i_w_map[word_id]; int idx_output =
const Dtype* in_data = reinterpret_cast<const Dtype*>( (idx_outer * out_concat_axis + offset_concat_axis) * concat_size +
reinterpret_cast<uintptr_t>(in_locate_data[input_id])); idx_inner;
out_data[tid] = in_data[cur_work_id * emb_size + emb_id]; out_data[idx_output] = in_data[idx_input];
} }
} }
...@@ -66,73 +67,75 @@ void SequenceConcatCompute::Run() { ...@@ -66,73 +67,75 @@ void SequenceConcatCompute::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>(); auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream(); auto stream = ctx.exec_stream();
float* out_data = param.Out->mutable_data<float>(TARGET(kCUDA));
int seq_num = param.X[0]->lod()[0].size() - 1; const int BLOCK_SIZE = 32;
const int emb_size = param.X[0]->numel() / param.X[0]->dims()[0]; const int axis = 1;
std::vector<uint64_t> in_locate_vec; int num_concats = param.X[0]->dims().count(0, axis);
for (size_t i = 0; i < param.X.size(); ++i) { int concat_input_size =
in_locate_vec.push_back( param.X[0]->dims().count(axis + 1, param.X[0]->dims().size());
reinterpret_cast<uintptr_t>(param.X[i]->data<float>()));
}
in_locate_tensor.Resize({static_cast<int64_t>(in_locate_vec.size())});
std::vector<int> out2in_map; int input_size = param.X.size();
std::vector<int> out2in_word_map; std::vector<std::vector<int64_t>> shapes_in(input_size);
for (int i = 0; i < seq_num; ++i) { for (int i = 0; i < input_size; ++i) {
for (int j = 0; j < param.X.size(); ++j) { shapes_in[i] = param.X[i]->dims().Vectorize();
auto offset = param.X[j]->lod()[0]; }
int cur_len = offset[i + 1] - offset[i]; std::vector<int64_t> shape_out = shapes_in[0];
for (int k = 0; k < cur_len; ++k) {
out2in_map.push_back(j); // compute output shape
out2in_word_map.push_back(offset[i] + k); for (int i = 1; i < input_size; ++i) {
for (int j = 0; j < shapes_in[i].size(); ++j) {
if (j == axis) {
continue;
} else if (shapes_in[i][j] != -1) {
CHECK_EQ(shape_out[j], shapes_in[i][j])
<< "All inputs must have the same shape, except at concat_axis.";
} }
} }
shape_out[axis] += shapes_in[i][axis];
} }
int word_num = out2in_map.size();
out2in_map_tensor.Resize({word_num});
out2in_word_map_tensor.Resize({word_num});
int* gpu_o2i_map_data = out2in_map_tensor.mutable_data<int>(TARGET(kCUDA));
int* gpu_o2i_w_map_data =
out2in_word_map_tensor.mutable_data<int>(TARGET(kCUDA));
uint64_t* gpu_in_locate_data =
in_locate_tensor.mutable_data<uint64_t>(TARGET(kCUDA));
TargetWrapperCuda::MemcpyAsync(gpu_o2i_map_data, param.Out->Resize(shape_out);
out2in_map.data(), float* out_data = param.Out->mutable_data<float>(TARGET(kCUDA));
sizeof(int) * out2in_map.size(), int offset_concat_axis = 0;
IoDirection::HtoD, const int out_concat_axis = shape_out[axis];
stream);
TargetWrapperCuda::MemcpyAsync(gpu_o2i_w_map_data, for (int i = 0; i < input_size; ++i) {
out2in_word_map.data(), std::vector<int64_t> in_shape = param.X[i]->dims().Vectorize();
sizeof(int) * out2in_word_map.size(), const auto* in_data = param.X[i]->data<float>();
IoDirection::HtoD, const int in_concat_axis = in_shape[axis];
stream); const int in_concat_size = in_concat_axis * concat_input_size;
TargetWrapperCuda::MemcpyAsync(gpu_in_locate_data, const int nthreads = in_concat_size * num_concats;
in_locate_vec.data(), float ratio = static_cast<float>(in_concat_size) / num_concats;
sizeof(uint64_t) * in_locate_vec.size(), bool is_balance = (ratio > 0.1 && ratio < 10);
IoDirection::HtoD, if (is_balance) {
stream); int block_x = BLOCK_SIZE;
int block_y = BLOCK_SIZE;
param.Out->set_lod(ConcatLoD<float>(param.X)); int grid_x = (in_concat_size + block_x - 1) / block_x;
int grid_y = (num_concats + block_y - 1) / block_y;
int count = param.X[0]->numel(); dim3 block(block_x, block_y);
for (int i = 1; i < param.X.size(); ++i) { dim3 grid(grid_x, grid_y);
count += param.X[i]->numel(); concat_impl_2d_impl<float><<<grid, block, 0, stream>>>(in_concat_size,
num_concats,
in_data,
concat_input_size,
out_concat_axis,
offset_concat_axis,
out_data);
} else {
int grid = (nthreads + BLOCK_SIZE - 1) / BLOCK_SIZE;
concat_impl_cuda<float><<<grid, BLOCK_SIZE, 0, stream>>>(
nthreads,
in_data,
num_concats,
concat_input_size,
out_concat_axis,
in_concat_axis,
offset_concat_axis,
out_data);
}
offset_concat_axis += in_concat_axis;
} }
param.Out->set_lod(param.X[0]->lod());
int blocks = (count + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
ker_sequence_concat<float><<<blocks, CUDA_NUM_THREADS, 0, stream>>>(
out_data,
gpu_in_locate_data,
gpu_o2i_map_data,
gpu_o2i_w_map_data,
seq_num,
emb_size,
count);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
} }
} // namespace cuda } // namespace cuda
......
...@@ -27,11 +27,6 @@ class SequenceConcatCompute ...@@ -27,11 +27,6 @@ class SequenceConcatCompute
void Run() override; void Run() override;
virtual ~SequenceConcatCompute() = default; virtual ~SequenceConcatCompute() = default;
private:
lite::Tensor out2in_map_tensor;
lite::Tensor out2in_word_map_tensor;
lite::Tensor in_locate_tensor;
}; };
} // namespace cuda } // namespace cuda
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <vector>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/op_registry.h"
#include "lite/core/target_wrapper.h"
#include "lite/kernels/cuda/sequence_pool_concat_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
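// Fused sequence_pool + concat kernel for the case where all inputs share
// the same feature size: each thread decodes (seq_id, in_id, em_id) from
// its output index and applies LAST (4) or MAX (6) pooling over the
// sequence range given by `offset`, writing straight into the
// concatenated output.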
template <typename Dtype>
__global__ void sequence_pool_concat(const uint64_t* input_locate_data,
const int* pool_type_list,
Dtype* output_data,
const int* offset,
int batch,
int in_num,
int in_dim) {
int tid = threadIdx.x + blockDim.x * blockIdx.x;
int em_id = tid % in_dim;
int in_id = (tid / in_dim) % in_num;
int seq_id = tid / (in_dim * in_num);
if (seq_id >= batch) {
return;
}
Dtype* out_data = output_data + tid;
int offset_id = in_id * (batch + 1) + seq_id;
if (pool_type_list[in_id] == 4) { // last
const Dtype* in_data =
reinterpret_cast<const Dtype*>(
reinterpret_cast<uintptr_t>(input_locate_data[in_id])) +
em_id;
output_data[tid] = in_data[(offset[offset_id + 1] - 1) * in_dim];
} else if (pool_type_list[in_id] == 6) { // max
const Dtype* in_data =
reinterpret_cast<const Dtype*>(
reinterpret_cast<uintptr_t>(input_locate_data[in_id])) +
em_id + offset[offset_id] * in_dim;
Dtype max = in_data[0];
for (int i = 1; i < offset[offset_id + 1] - offset[offset_id]; i++) {
Dtype cur_data = in_data[i * in_dim];
max = cur_data > max ? cur_data : max;
}
output_data[tid] = max;
} else {
return;
}
}
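// Variant for inputs with different feature sizes: out_id_seq_map maps an
// output column back to its source input and out_offset recovers that
// input's column range and feature dim.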
template <typename Dtype>
__global__ void sequence_pool_concat(const uint64_t* input_locate_data,
const int* pool_type_list,
Dtype* output_data,
const int* offset,
int batch,
int in_num,
const int* out_offset,
const int* out_id_seq_map_data,
int out_dim) {
int tid = threadIdx.x + blockDim.x * blockIdx.x;
int em_id = tid % out_dim;
int seq_id = tid / out_dim;
int in_id = out_id_seq_map_data[em_id];
em_id = em_id - out_offset[in_id];
int in_dim = out_offset[in_id + 1] - out_offset[in_id];
if (seq_id >= batch) {
return;
}
Dtype* out_data = output_data + tid;
int offset_id = in_id * (batch + 1) + seq_id;
if (pool_type_list[in_id] == 4) { // last
const Dtype* in_data =
reinterpret_cast<const Dtype*>(
reinterpret_cast<uintptr_t>(input_locate_data[in_id])) +
em_id;
output_data[tid] = in_data[(offset[offset_id + 1] - 1) * in_dim];
} else if (pool_type_list[in_id] == 6) { // max
const Dtype* in_data =
reinterpret_cast<const Dtype*>(
reinterpret_cast<uintptr_t>(input_locate_data[in_id])) +
em_id + offset[offset_id] * in_dim;
Dtype max = in_data[0];
for (int i = 1; i < offset[offset_id + 1] - offset[offset_id]; i++) {
Dtype cur_data = in_data[i * in_dim];
max = cur_data > max ? cur_data : max;
}
output_data[tid] = max;
} else {
return;
}
}
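// PrepareForRun uploads the metadata that is constant across batches:
// encoded pool types, output column offsets and the output-column -> input
// map; it also records whether all inputs share the same feature length so
// Run() can pick the simpler kernel.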
void SequencePoolConcatCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
int in_num = param.X.size();
std::vector<int64_t> shape({in_num, 1, 1, 1});
_in_offset_tensor.Resize(shape);
_in_ptr_tensor.Resize(shape);
_in_pool_type_tensor.Resize(shape);
int* in_pool_type_data =
_in_pool_type_tensor.mutable_data<int>(TARGET(kCUDA));
std::vector<int> pool_type_list;
for (auto type : param.pool_type) {
if (type == "AVERAGE") {
pool_type_list.push_back(1);
} else if (type == "SUM") {
pool_type_list.push_back(2);
} else if (type == "SQRT") {
pool_type_list.push_back(3);
} else if (type == "LAST") {
pool_type_list.push_back(4);
} else if (type == "FIRST") {
pool_type_list.push_back(5);
} else if (type == "MAX") {
pool_type_list.push_back(6);
} else {
LOG(ERROR) << "pool type " << type << " is not supoorted.";
}
}
_is_in_same_len = true;
int in_len = param.X[0]->dims().count(1, param.X[0]->dims().size());
std::vector<int> out_id_seq_map_list;
std::vector<int> out_offset_list;
int total_len = 0;
out_offset_list.push_back(total_len);
for (int i = 0; i < in_num; ++i) {
int cur_len = param.X[i]->dims().count(1, param.X[i]->dims().size());
_is_in_same_len = _is_in_same_len && in_len == cur_len;
for (int k = 0; k < cur_len; ++k) {
out_id_seq_map_list.push_back(i);
}
total_len += cur_len;
out_offset_list.push_back(total_len);
}
std::vector<int64_t> out_id_seq_map_shape({total_len, 1, 1, 1});
std::vector<int64_t> out_offset_shape({in_num + 1, 1, 1, 1});
_out_offset_tensor.Resize(out_offset_shape);
_out_id_seq_map_tensor.Resize(out_id_seq_map_shape);
int* out_offset_data = _out_offset_tensor.mutable_data<int>(TARGET(kCUDA));
int* out_id_seq_map_data =
_out_id_seq_map_tensor.mutable_data<int>(TARGET(kCUDA));
TargetWrapperCuda::MemcpyAsync(in_pool_type_data,
&pool_type_list[0],
sizeof(int) * param.X.size(),
IoDirection::HtoD,
stream);
TargetWrapperCuda::MemcpyAsync(out_offset_data,
&out_offset_list[0],
sizeof(int) * out_offset_list.size(),
IoDirection::HtoD,
stream);
TargetWrapperCuda::MemcpyAsync(out_id_seq_map_data,
&out_id_seq_map_list[0],
sizeof(int) * out_id_seq_map_list.size(),
IoDirection::HtoD,
stream);
cudaStreamSynchronize(stream);
}
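// Run gathers the lod offsets of every input into one flat array, uploads
// the raw input pointers, and launches the fused kernel (same-length or
// varying-length path).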
void SequencePoolConcatCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
auto& inputs = param.X;
auto offset = inputs[0]->lod()[0];
int batch = offset.size() - 1;
CHECK_GE(offset.size(), 1);
std::vector<int> all_offset;
for (int i = 0; i < inputs.size(); ++i) {
auto it = all_offset.end();
auto cur_offset = inputs[i]->lod()[0];
all_offset.insert(it, cur_offset.begin(), cur_offset.end());
}
int total_size = all_offset.size();
std::vector<int64_t> offset_shape({total_size, 1, 1, 1});
_in_offset_tensor.Resize(offset_shape);
int* offset_data = _in_offset_tensor.mutable_data<int>(TARGET(kCUDA));
TargetWrapperCuda::MemcpyAsync(offset_data,
&all_offset[0],
sizeof(int) * all_offset.size(),
IoDirection::HtoD,
stream);
std::vector<uint64_t> in_locate_vec;
for (int i = 0; i < inputs.size(); ++i) {
in_locate_vec.push_back(
reinterpret_cast<uintptr_t>(inputs[i]->data<float>()));
}
uint64_t* in_locate_data =
_in_ptr_tensor.mutable_data<uint64_t>(TARGET(kCUDA));
TargetWrapperCuda::MemcpyAsync(in_locate_data,
&in_locate_vec[0],
sizeof(uint64_t) * inputs.size(),
IoDirection::HtoD,
stream);
const int* in_pool_type_data = _in_pool_type_tensor.data<int>();
const int* out_id_seq_map_data = _out_id_seq_map_tensor.data<int>();
const int* out_offset_data = _out_offset_tensor.data<int>();
int count = param.Out->numel();
int in_dim = inputs[0]->numel() / inputs[0]->dims()[0];
float* out_data = param.Out->mutable_data<float>(TARGET(kCUDA));
int in_num = inputs.size();
if (_is_in_same_len) {
sequence_pool_concat<
float><<<CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>>(
in_locate_data,
in_pool_type_data,
out_data,
offset_data,
batch,
in_num,
in_dim);
} else {
int out_dim = param.Out->numel() / param.Out->dims()[0];
sequence_pool_concat<
float><<<CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>>(
in_locate_data,
in_pool_type_data,
out_data,
offset_data,
batch,
in_num,
out_offset_data,
out_id_seq_map_data,
out_dim);
}
}
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(sequence_pool_concat,
kCUDA,
kFloat,
kNCHW,
paddle::lite::kernels::cuda::SequencePoolConcatCompute,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/kernel.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {
class SequencePoolConcatCompute
: public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
public:
using param_t = operators::SequencePoolConcatParam;
void Run() override;
void PrepareForRun() override;
virtual ~SequencePoolConcatCompute() = default;
private:
lite::Tensor _in_offset_tensor;
lite::Tensor _in_ptr_tensor;
lite::Tensor _in_pool_type_tensor;
lite::Tensor _out_offset_tensor;
lite::Tensor _out_id_seq_map_tensor;
bool _is_in_same_len;
};
} // namespace cuda
} // namespace kernels
} // namespace lite
} // namespace paddle
...@@ -26,6 +26,8 @@ __global__ void topk_avg_pooling_kernel_by_row_improve( ...@@ -26,6 +26,8 @@ __global__ void topk_avg_pooling_kernel_by_row_improve(
const Dtype *input, const Dtype *input,
const int *gpu_input_offset_l, const int *gpu_input_offset_l,
const int *gpu_input_offset_r, const int *gpu_input_offset_r,
const int row_max,
const int col_max,
const int topk_size, const int topk_size,
const int *topks, const int *topks,
const int feat_map_num) { const int feat_map_num) {
...@@ -33,20 +35,17 @@ __global__ void topk_avg_pooling_kernel_by_row_improve( ...@@ -33,20 +35,17 @@ __global__ void topk_avg_pooling_kernel_by_row_improve(
gpu_input_offset_l[blockIdx.x + 1] - gpu_input_offset_l[blockIdx.x]; // 8 gpu_input_offset_l[blockIdx.x + 1] - gpu_input_offset_l[blockIdx.x]; // 8
int col = gpu_input_offset_r[blockIdx.x + 1] - int col = gpu_input_offset_r[blockIdx.x + 1] -
gpu_input_offset_r[blockIdx.x]; // 30 gpu_input_offset_r[blockIdx.x]; // 30
int max_k = topks[topk_size - 1]; int max_k = topks[topk_size - 1];
max_k = max_k < col ? max_k : col; max_k = max_k < col ? max_k : col;
extern __shared__ Dtype smem[]; // H*W extern __shared__ Dtype smem[]; // H*W
const Dtype *fm_row_in_data = input; const Dtype *fm_row_in_data = input +
for (int i = 0; i < blockIdx.x; ++i) { blockIdx.x * row_max * feat_map_num * col_max +
int tmp_row = gpu_input_offset_l[i + 1] - gpu_input_offset_l[i]; blockIdx.y * row_max * col_max;
int tmp_col = gpu_input_offset_r[i + 1] - gpu_input_offset_r[i];
fm_row_in_data += tmp_row * feat_map_num * tmp_col;
}
fm_row_in_data += blockIdx.y * row * col;
for (int i = threadIdx.x; i < row * col; i += blockDim.x) { for (int i = threadIdx.x; i < row * col_max; i += blockDim.x) {
smem[i] = fm_row_in_data[i]; smem[i] = fm_row_in_data[i];
} }
__syncthreads(); __syncthreads();
...@@ -57,13 +56,13 @@ __global__ void topk_avg_pooling_kernel_by_row_improve( ...@@ -57,13 +56,13 @@ __global__ void topk_avg_pooling_kernel_by_row_improve(
(gpu_input_offset_l[blockIdx.x] + idx) * feat_map_num * topk_size + (gpu_input_offset_l[blockIdx.x] + idx) * feat_map_num * topk_size +
blockIdx.y * topk_size; blockIdx.y * topk_size;
Dtype *smem_start_col = smem + idx * col; Dtype *smem_start_col = smem + idx * col_max;
int counter = max_k; // topk_size; int counter = max_k; // topk_size;
Dtype last_max_val = -20000.0; Dtype last_max_val = -20000.0;
while (counter) { while (counter) {
Dtype max_val = -10000.0; Dtype max_val = -10000.0;
int max_pos = 0; int max_pos = 0; // -1;
int m = 0; int m = 0;
for (; m < col; m++) { for (; m < col; m++) {
Dtype cur_data = smem_start_col[m]; Dtype cur_data = smem_start_col[m];
...@@ -77,6 +76,7 @@ __global__ void topk_avg_pooling_kernel_by_row_improve( ...@@ -77,6 +76,7 @@ __global__ void topk_avg_pooling_kernel_by_row_improve(
max_val = last_max_val; max_val = last_max_val;
} }
smem_start_col[max_pos] = -10000000.0; smem_start_col[max_pos] = -10000000.0;
int i = max_k - counter; int i = max_k - counter;
for (int c = 0; c < topk_size; c++) { for (int c = 0; c < topk_size; c++) {
if (i <= topks[c] - 1) { if (i <= topks[c] - 1) {
...@@ -98,22 +98,18 @@ void SequenceTopkAvgPoolingCompute<T>::Run() { ...@@ -98,22 +98,18 @@ void SequenceTopkAvgPoolingCompute<T>::Run() {
auto &param = this->Param<param_t>(); auto &param = this->Param<param_t>();
auto &ctx = this->ctx_->template As<CUDAContext>(); auto &ctx = this->ctx_->template As<CUDAContext>();
auto cuda_stream = ctx.exec_stream(); auto cuda_stream = ctx.exec_stream();
int topk_num = param.topks.size();
lite::DDim top_ks_shape(std::vector<int64_t>{topk_num, 1, 1, 1});
_top_ks.Resize(top_ks_shape);
cudaMemcpyAsync(_top_ks.mutable_data<int>(TARGET(kCUDA)),
&param.topks[0],
sizeof(int) * topk_num,
cudaMemcpyHostToDevice,
cuda_stream);
int width_offset_len = param.COLUMN->lod()[0].size(); CHECK(param.X->lod().size() > 0 && param.X->lod()[0].size() > 0)
lite::DDim width_offset_shape( << "X sequence offset is not valid";
std::vector<int64_t>{width_offset_len, 1, 1, 1}); CHECK(param.ROW->lod().size() > 0 && param.ROW->lod()[0].size() > 0)
<< "ROW sequence offset is not valid";
int width_offset_len = param.X->lod()[0].size();
lite::DDim width_offset_shape(std::vector<int64_t>{width_offset_len});
_width_offset.Resize(width_offset_shape); _width_offset.Resize(width_offset_shape);
std::vector<int> width_lod_0(width_offset_len, 0); std::vector<int> width_lod_0(width_offset_len, 0);
for (size_t i = 0; i < param.COLUMN->lod()[0].size(); ++i) { for (size_t i = 0; i < param.X->lod()[0].size(); ++i) {
width_lod_0[i] = static_cast<int>(param.COLUMN->lod()[0][i]); width_lod_0[i] = static_cast<int>(param.X->lod()[0][i]);
} }
cudaMemcpyAsync(_width_offset.mutable_data<int>(TARGET(kCUDA)), cudaMemcpyAsync(_width_offset.mutable_data<int>(TARGET(kCUDA)),
&width_lod_0[0], &width_lod_0[0],
...@@ -122,8 +118,7 @@ void SequenceTopkAvgPoolingCompute<T>::Run() { ...@@ -122,8 +118,7 @@ void SequenceTopkAvgPoolingCompute<T>::Run() {
cuda_stream); cuda_stream);
int height_offset_len = param.ROW->lod()[0].size(); int height_offset_len = param.ROW->lod()[0].size();
lite::DDim height_offset_shape( lite::DDim height_offset_shape(std::vector<int64_t>{height_offset_len});
std::vector<int64_t>{height_offset_len, 1, 1, 1});
_height_offset.Resize(height_offset_shape); _height_offset.Resize(height_offset_shape);
std::vector<int> height_lod_0(height_offset_len, 0); std::vector<int> height_lod_0(height_offset_len, 0);
for (size_t i = 0; i < param.ROW->lod()[0].size(); ++i) { for (size_t i = 0; i < param.ROW->lod()[0].size(); ++i) {
...@@ -139,39 +134,42 @@ void SequenceTopkAvgPoolingCompute<T>::Run() { ...@@ -139,39 +134,42 @@ void SequenceTopkAvgPoolingCompute<T>::Run() {
Tensor *out_tensor = param.Out; Tensor *out_tensor = param.Out;
const T *in_data = x_tensor->data<T>(); const T *in_data = x_tensor->data<T>();
T *out_data = out_tensor->mutable_data<T>(TARGET(kCUDA)); T *out_data = out_tensor->mutable_data<T>(TARGET(kCUDA));
TargetWrapperCuda::MemsetAsync(out_tensor->mutable_data<T>(TARGET(kCUDA)), TargetWrapperCuda::MemsetAsync(
0, out_data, 0, sizeof(T) * param.Out->numel(), cuda_stream);
sizeof(T) * out_tensor->numel(),
cuda_stream); int topk_num = param.topks.size();
lite::DDim top_ks_shape(std::vector<int64_t>{topk_num, 1, 1, 1});
_top_ks.Resize(top_ks_shape);
cudaMemcpyAsync(_top_ks.mutable_data<int>(TARGET(kCUDA)),
&param.topks[0],
sizeof(int) * topk_num,
cudaMemcpyHostToDevice,
cuda_stream);
int num = param.ROW->lod()[0].size() - 1; int num = param.X->dims()[0];
int channel = param.channel_num; int channel = param.X->dims()[1];
int height = param.X->dims()[2];
int width = param.X->dims()[3];
const int *height_offset = _height_offset.data<int>(); const int *height_offset = _height_offset.data<int>();
const int *width_offset = _width_offset.data<int>(); const int *width_offset = _width_offset.data<int>();
int feat_map_size = 0; int feat_map_size = height * width;
for (size_t i = 0; i < height_lod_0.size() - 1; ++i) {
int height = height_lod_0[i + 1] - height_lod_0[i];
int width = width_lod_0[i + 1] - width_lod_0[i];
if (height * width > feat_map_size) {
feat_map_size = height * width;
}
}
dim3 blocks(num, channel); dim3 blocks(num, channel);
dim3 threads(32, 1); dim3 threads(32, 1);
topk_avg_pooling_kernel_by_row_improve< topk_avg_pooling_kernel_by_row_improve<
T><<<blocks, threads, feat_map_size * sizeof(T), cuda_stream>>>( T><<<blocks, threads, feat_map_size * sizeof(T), cuda_stream>>>(
out_data, out_data,
in_data, in_data,
height_offset, height_offset,
width_offset, width_offset,
height,
width,
param.topks.size(), param.topks.size(),
_top_ks.data<int>(), _top_ks.data<int>(),
param.channel_num); param.channel_num);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
} }
} // namespace cuda } // namespace cuda
......
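For reference, the pooling the CUDA kernel above performs per (sequence, channel) pair can be sketched on the host as follows. This is only an illustrative reference, not code from the commit: the function name topk_avg_pool_row is hypothetical, and the "divide by k" averaging convention is an assumption.

#include <algorithm>
#include <functional>
#include <vector>

// Hypothetical host-side reference: for one row, emit the average of its
// top-k values for every k in `topks` (averaging by k is assumed here).
std::vector<float> topk_avg_pool_row(std::vector<float> row,
                                     const std::vector<int>& topks) {
  std::sort(row.begin(), row.end(), std::greater<float>());
  std::vector<float> out(topks.size(), 0.f);
  float running_sum = 0.f;
  const int max_k = *std::max_element(topks.begin(), topks.end());
  for (int i = 0; i < max_k && i < static_cast<int>(row.size()); ++i) {
    running_sum += row[i];
    for (size_t c = 0; c < topks.size(); ++c) {
      if (i < topks[c]) out[c] = running_sum / topks[c];  // sum of top-k / k
    }
  }
  return out;
}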
...@@ -21,6 +21,8 @@ namespace kernels { ...@@ -21,6 +21,8 @@ namespace kernels {
namespace cuda { namespace cuda {
using Tensor = lite::Tensor; using Tensor = lite::Tensor;
const int CUDA_NUM_THREADS = 512;
extern __shared__ char tile[]; extern __shared__ char tile[];
template <typename dtype> template <typename dtype>
__global__ void sharemem_softmax_kernel(int total_size, __global__ void sharemem_softmax_kernel(int total_size,
...@@ -149,6 +151,15 @@ __global__ void softmax_divid_output_kernel(int total_size, ...@@ -149,6 +151,15 @@ __global__ void softmax_divid_output_kernel(int total_size,
} }
} }
void SoftmaxCompute::PrepareForRun() {
int device_id;
cudaGetDevice(&device_id);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, device_id);
sharedmem_size = deviceProp.sharedMemPerBlock;
max_dimsize = sharedmem_size / sizeof(float) / CUDA_NUM_THREADS;
}
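The budget computed in PrepareForRun bounds how long a softmax axis the shared-memory kernel can handle. A minimal arithmetic sketch, assuming a 48 KB sharedMemPerBlock (an assumed figure, not a queried one):

// Illustrative arithmetic only; 48 KB is an assumed sharedMemPerBlock value.
const size_t assumed_sharedmem_size = 48 * 1024;   // bytes per block
const int threads = 512;                           // CUDA_NUM_THREADS
const int max_dimsize = assumed_sharedmem_size / sizeof(float) / threads;
// -> 48 * 1024 / 4 / 512 = 24: axes of up to 24 elements take the
//    sharemem_softmax_kernel path in Run(); longer axes use the fallbacks.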
void SoftmaxCompute::Run() { void SoftmaxCompute::Run() {
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>(); auto& ctx = this->ctx_->template As<CUDAContext>();
...@@ -165,18 +176,10 @@ void SoftmaxCompute::Run() { ...@@ -165,18 +176,10 @@ void SoftmaxCompute::Run() {
int total_threads = inner_num * outer_num; int total_threads = inner_num * outer_num;
int axis_size = x_dims[axis]; int axis_size = x_dims[axis];
int device_id; const int threads = CUDA_NUM_THREADS;
const int threads = 512;
const int blocks = (total_threads + threads - 1) / threads; const int blocks = (total_threads + threads - 1) / threads;
cudaGetDevice(&device_id);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, device_id);
size_t sharedmem_size = deviceProp.sharedMemPerBlock;
int max_dimsize = sharedmem_size / sizeof(float) / threads;
auto input_data = param.x->data<float>(); auto input_data = param.x->data<float>();
auto output_data = param.output->mutable_data<float>(TARGET(kCUDA)); auto output_data = param.output->mutable_data<float>(TARGET(kCUDA));
TargetWrapperCuda::MemsetSync(
output_data, 0, param.output->numel() * sizeof(float));
if (axis_size <= max_dimsize) { if (axis_size <= max_dimsize) {
int use_sharemem_size = axis_size * threads * sizeof(float); int use_sharemem_size = axis_size * threads * sizeof(float);
sharemem_softmax_kernel<<<blocks, threads, use_sharemem_size, stream>>>( sharemem_softmax_kernel<<<blocks, threads, use_sharemem_size, stream>>>(
......
...@@ -25,8 +25,14 @@ class SoftmaxCompute ...@@ -25,8 +25,14 @@ class SoftmaxCompute
public: public:
using param_t = operators::SoftmaxParam; using param_t = operators::SoftmaxParam;
void PrepareForRun() override;
void Run() override; void Run() override;
virtual ~SoftmaxCompute() = default; virtual ~SoftmaxCompute() = default;
private:
size_t sharedmem_size;
int num_threads;
int max_dimsize;
}; };
} // namespace cuda } // namespace cuda
......
...@@ -25,224 +25,83 @@ namespace lite { ...@@ -25,224 +25,83 @@ namespace lite {
namespace kernels { namespace kernels {
namespace cuda { namespace cuda {
const int CUDA_NUM_THREADS = 512; inline int ConvOutputSize(int input_size,
int filter_size,
template <typename Dtype> int dilation,
__global__ void var_im2col_gpu_kernel(const int n, int pad_left,
const Dtype* data_im, int pad_right,
const int height, int stride) {
const int width, const int dkernel = dilation * (filter_size - 1) + 1;
const int kernel_h, int output_size =
const int kernel_w, (input_size + (pad_left + pad_right) - dkernel) / stride + 1;
const int pad_h,
const int pad_w, return output_size;
const int stride_h,
const int stride_w,
const int height_col,
const int width_col,
Dtype* data_col) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
for (int index = idx; index < n; index += blockDim.x * gridDim.x) {
const int h_index = index / width_col;
const int h_col = h_index % height_col;
const int w_col = index % width_col;
const int c_im = h_index / height_col;
const int c_col = c_im * kernel_h * kernel_w;
const int h_offset = h_col * stride_h - pad_h;
const int w_offset = w_col * stride_w - pad_w;
Dtype* data_col_ptr = data_col;
data_col_ptr += (c_col * height_col + h_col) * width_col + w_col;
const Dtype* data_im_ptr = data_im;
data_im_ptr += (c_im * height + h_offset) * width + w_offset;
for (int i = 0; i < kernel_h; ++i) {
for (int j = 0; j < kernel_w; ++j) {
int h_im = h_offset + i;
int w_im = w_offset + j;
*data_col_ptr =
(h_im >= 0 && w_im >= 0 && h_im < height && w_im < width)
? data_im_ptr[i * width + j]
: 0;
data_col_ptr += height_col * width_col;
}
}
}
} }
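A quick sanity check of the formula above, with values chosen purely for illustration:

// dkernel = 1 * (3 - 1) + 1 = 3
// output  = (19 + (1 + 1) - 3) / 1 + 1 = 19  (spatial size preserved)
int out = ConvOutputSize(/*input_size=*/19, /*filter_size=*/3, /*dilation=*/1,
                         /*pad_left=*/1, /*pad_right=*/1, /*stride=*/1);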
void VarConv2DCompute::var_im2col(const cudaStream_t& stream) { void VarConv2DCompute::PrepareForRun() {
auto& context = this->ctx_->template As<CUDAContext>();
auto stream = context.exec_stream();
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
int input_channel = param.input_channel; conv_param_.x = const_cast<lite::Tensor*>(param.X);
int kernel_h = param.kernel_h; conv_param_.var_length = true;
int kernel_w = param.kernel_w;
int stride_h = param.stride_h; conv_param_.paddings.reset(new std::vector<int>);
int stride_w = param.stride_w; conv_param_.paddings->push_back(static_cast<int>(param.kernel_h / 2));
// auto* in_row = param.ROW; conv_param_.paddings->push_back(static_cast<int>(param.kernel_h / 2));
// auto* in_col = param.COLUMN; conv_param_.paddings->push_back(static_cast<int>(param.kernel_w / 2));
const auto* input = param.X; conv_param_.paddings->push_back(static_cast<int>(param.kernel_w / 2));
auto* col = param.Col; conv_param_.dilations.reset(new std::vector<int>);
conv_param_.dilations->push_back(1);
int batch = input->lod()[0].size() - 1; conv_param_.dilations->push_back(1);
const auto& bottom_offset = input->lod()[0]; conv_param_.strides[0] = param.stride_h;
// 2-D lod info. conv_param_.strides[1] = param.stride_w;
// const auto& offset_x = in_col->lod()[0]; conv_param_.filter = const_cast<lite::Tensor*>(param.W);
// const auto& offset_y = in_row->lod()[0]; conv_param_.filter->Resize({param.output_channel,
const auto& offset_y = param.X->lod()[1]; param.input_channel,
const auto& offset_x = param.X->lod()[2]; param.kernel_h,
// top offset is the whole size of each data sample param.kernel_w});
std::vector<uint64_t> top_offset;
int top_size = 0; conv_param_.output = param.Out;
top_offset.push_back(top_size); std::vector<int64_t> output_shape(
for (int b = 0; b < batch; ++b) { {conv_param_.x->dims()[0], param.output_channel});
int width = offset_x[b + 1] - offset_x[b]; for (size_t i = 0; i < conv_param_.strides.size(); ++i) {
int height = offset_y[b + 1] - offset_y[b]; output_shape.push_back(
int top_im_x = 0; ConvOutputSize(conv_param_.x->dims()[i + 2],
if (width == 0) { conv_param_.filter->dims()[i + 2],
top_im_x = 0; (*conv_param_.dilations.get())[i],
} else { (*conv_param_.paddings.get())[i * 2],
top_im_x = (width - 1) / stride_w + 1; (*conv_param_.paddings.get())[i * 2 + 1],
} conv_param_.strides[i]));
int top_im_y = 0;
if (height == 0) {
top_im_y = 0;
} else {
top_im_y = (height - 1) / stride_h + 1;
}
int top_x = top_im_x * top_im_y;
int top_y = input_channel * kernel_h * kernel_w;
top_size += top_y * top_x;
top_offset.push_back(top_size);
} }
if (param.fuse_relu) {
LoD col_lod; conv_param_.activation_param.has_active = true;
col_lod.push_back(top_offset); conv_param_.activation_param.active_type = lite_api::ActivationType::kRelu;
col->set_lod(col_lod);
std::vector<int64_t> col_dims_vec{top_size};
col_dims_vec.push_back(1);
col->Resize(col_dims_vec);
auto* top_data = col->mutable_data<float>(TARGET(kCUDA));
const auto* bottom_data = input->data<float>();
for (int b = 0; b < batch; ++b) {
int t_offset = top_offset[b];
int b_offset = bottom_offset[b];
int width = offset_x[b + 1] - offset_x[b];
int height = offset_y[b + 1] - offset_y[b];
if (width == 0 || height == 0) {
continue;
}
int width_col = (width - 1) / stride_w + 1;
int height_col = (height - 1) / stride_h + 1;
const float* data_im = bottom_data + b_offset;
float* data_col = top_data + t_offset;
// We are going to launch channels * height_col * width_col kernels, each
// kernel responsible for copying a single-channel grid.
int num_kernels = height_col * width_col * input_channel;
const int CUDA_NUM_BLOCKS =
(num_kernels + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
var_im2col_gpu_kernel<
float><<<CUDA_NUM_BLOCKS, CUDA_NUM_THREADS, 0, stream>>>(
num_kernels,
data_im,
height,
width,
kernel_h,
kernel_w,
((stride_h - 1) * height + kernel_h - 1) / 2,
((stride_w - 1) * width + kernel_w - 1) / 2,
stride_h,
stride_w,
height_col,
width_col,
data_col);
} }
conv_param_.output->Resize({output_shape});
conv_impl_.reset(new lite::cuda::math::CudnnConv2D<PRECISION(kFloat)>);
conv_impl_->init(conv_param_, &context);
} }
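PrepareForRun derives the paddings from the kernel size (kernel_h / 2 and kernel_w / 2 on each side); for odd kernels at stride 1 this reproduces the input spatial size through ConvOutputSize. A minimal sketch under those assumptions, with an illustrative input size:

// Illustration only: odd kernel, stride 1, pad = kernel / 2 on each side.
int kernel_h = 5;
int pad = kernel_h / 2;  // 2
int out_h = ConvOutputSize(/*input_size=*/32, kernel_h, /*dilation=*/1,
                           pad, pad, /*stride=*/1);  // 32, i.e. "same"-style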
void VarConv2DCompute::Run() { void VarConv2DCompute::Run() {
auto& context = this->ctx_->template As<CUDAContext>();
auto stream = context.exec_stream();
auto& param = this->Param<param_t>(); auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<CUDAContext>();
auto stream = ctx.exec_stream();
auto* bottom = param.X; param.Out->set_lod(param.X->lod());
// auto* in_row = param.ROW; std::vector<int64_t> output_shape(
// auto* in_col = param.COLUMN; {conv_param_.x->dims()[0], param.output_channel});
auto* w = param.W; for (size_t i = 0; i < conv_param_.strides.size(); ++i) {
auto* top = param.Out; output_shape.push_back(
auto* col = param.Col; ConvOutputSize(conv_param_.x->dims()[i + 2],
int output_channel = param.output_channel; conv_param_.filter->dims()[i + 2],
int input_channel = param.input_channel; (*conv_param_.dilations.get())[i],
int kernel_h = param.kernel_h; (*conv_param_.paddings.get())[i * 2],
int kernel_w = param.kernel_w; (*conv_param_.paddings.get())[i * 2 + 1],
int stride_h = param.stride_h; conv_param_.strides[i]));
int stride_w = param.stride_w;
var_im2col(stream);
int batch = bottom->lod()[0].size() - 1;
const auto& col_offset = col->lod()[0];
// const auto& offset_x = in_col->lod()[0];
// const auto& offset_y = in_row->lod()[0];
const auto& offset_y = param.X->lod()[1];
const auto& offset_x = param.X->lod()[2];
std::vector<size_t> top_offset;
std::vector<int64_t> height_vector;
std::vector<int64_t> width_vector;
int top_size = 0;
top_offset.push_back(top_size);
for (int b = 0; b < batch; ++b) {
int width = offset_x[b + 1] - offset_x[b];
int height = offset_y[b + 1] - offset_y[b];
int top_im_x = 0;
if (width == 0) {
top_im_x = 0;
} else {
top_im_x = (width - 1) / stride_w + 1;
}
int top_im_y = 0;
if (height == 0) {
top_im_y = 0;
} else {
top_im_y = (height - 1) / stride_h + 1;
}
height_vector.push_back(top_im_y);
width_vector.push_back(top_im_x);
int top_im_size = top_im_y * top_im_x;
top_size += output_channel * top_im_size;
top_offset.push_back(top_size);
} }
conv_param_.output->Resize({output_shape});
LoD top_lod; conv_impl_->create(conv_param_, &context);
top_lod.push_back(top_offset); conv_impl_->run(conv_param_);
top->set_lod(top_lod);
std::vector<int64_t> top_dims_vec{top_size};
top_dims_vec.push_back(1);
top->Resize(top_dims_vec);
auto* top_data = top->mutable_data<float>(TARGET(kCUDA));
const auto* w_data = w->data<float>();
const auto* col_data = col->data<float>();
std::unique_ptr<lite::cuda::math::Gemm<float, float>> gemm_impl_;
for (int b = 0; b < batch; ++b) {
int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel;
if (top_im_size == 0) {
continue;
}
float* out_data = top_data + top_offset[b];
const float* in_data = col_data + col->lod()[0][b];
gemm_impl_.reset(new lite::cuda::math::Gemm<float, float>);
gemm_impl_->init(false,
false,
w->dims()[0],
height_vector[b] * width_vector[b],
input_channel * kernel_h * kernel_w,
&ctx);
gemm_impl_->run(1., 0., w_data, in_data, out_data, &ctx);
}
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
} }
} // namespace cuda } // namespace cuda
......
...@@ -13,6 +13,8 @@ ...@@ -13,6 +13,8 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <memory>
#include "lite/backends/cuda/math/cudnn_conv.h"
#include "lite/core/kernel.h" #include "lite/core/kernel.h"
namespace paddle { namespace paddle {
...@@ -25,10 +27,12 @@ class VarConv2DCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> { ...@@ -25,10 +27,12 @@ class VarConv2DCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
using param_t = operators::VarConv2DParam; using param_t = operators::VarConv2DParam;
void Run() override; void Run() override;
void PrepareForRun() override;
virtual ~VarConv2DCompute() = default; virtual ~VarConv2DCompute() = default;
private: private:
void var_im2col(const cudaStream_t& stream); mutable operators::ConvParam conv_param_;
std::unique_ptr<lite::cuda::math::CudnnConv2D<PRECISION(kFloat)>> conv_impl_;
}; };
} // namespace cuda } // namespace cuda
......
...@@ -52,7 +52,29 @@ class SequenceConcatCompute ...@@ -52,7 +52,29 @@ class SequenceConcatCompute
void Run() override { void Run() override {
auto& param = *param_.get_mutable<param_t>(); auto& param = *param_.get_mutable<param_t>();
// auto& param = Param<param_t>();
int64_t batch_size = 0;
int64_t feature_size = 0;
std::vector<int64_t> out_dims;
for (const auto& tensor : param.X) {
const auto x_dims = tensor->dims();
if (out_dims.empty()) {
out_dims = x_dims.Vectorize();
}
batch_size += x_dims[0];
if (feature_size == 0) {
feature_size = x_dims.production() / x_dims[0];
} else {
CHECK_EQ(feature_size, x_dims.production() / x_dims[0])
<< "Inputs of sequence concat must have same feature size";
}
}
if (batch_size < 0) {
batch_size = -1; // Normalize batch size for compile time.
}
out_dims[0] = batch_size;
param.Out->Resize(out_dims);
T* dout = param.Out->mutable_data<T>(); T* dout = param.Out->mutable_data<T>();
std::vector<lite::Tensor> x_in_order; std::vector<lite::Tensor> x_in_order;
......
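The shape bookkeeping moved into Run() above simply stacks the inputs along dim 0 while requiring a common per-row feature size. A small worked example (shapes chosen for illustration, using the DDim vector constructor seen elsewhere in this diff):

std::vector<lite::DDim> in_dims = {lite::DDim(std::vector<int64_t>{3, 4}),
                                   lite::DDim(std::vector<int64_t>{5, 4})};
int64_t batch_size = 0;
for (const auto& d : in_dims) batch_size += d[0];  // 3 + 5 = 8
// common feature size 4 (enforced by the CHECK_EQ above) -> Out dims {8, 4}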
...@@ -14,6 +14,9 @@ lite_cc_library(subgraph_bridge_pool_op_xpu SRCS pool_op.cc DEPS ${subgraph_brid ...@@ -14,6 +14,9 @@ lite_cc_library(subgraph_bridge_pool_op_xpu SRCS pool_op.cc DEPS ${subgraph_brid
lite_cc_library(subgraph_bridge_softmax_op_xpu SRCS softmax_op.cc DEPS ${subgraph_bridge_deps_xpu}) lite_cc_library(subgraph_bridge_softmax_op_xpu SRCS softmax_op.cc DEPS ${subgraph_bridge_deps_xpu})
lite_cc_library(subgraph_bridge_mul_op_xpu SRCS mul_op.cc DEPS ${xpu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_mul_op_xpu SRCS mul_op.cc DEPS ${xpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_batch_norm_op_xpu SRCS batch_norm_op.cc DEPS ${xpu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_batch_norm_op_xpu SRCS batch_norm_op.cc DEPS ${xpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_transpose_op_xpu SRCS transpose_op.cc DEPS ${xpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_reshape_op_xpu SRCS reshape_op.cc DEPS ${xpu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_layer_norm_op_xpu SRCS layer_norm_op.cc DEPS ${xpu_subgraph_bridge_deps})
set(xpu_subgraph_bridges set(xpu_subgraph_bridges
subgraph_bridge_registry subgraph_bridge_registry
...@@ -26,6 +29,9 @@ set(xpu_subgraph_bridges ...@@ -26,6 +29,9 @@ set(xpu_subgraph_bridges
subgraph_bridge_softmax_op_xpu subgraph_bridge_softmax_op_xpu
subgraph_bridge_mul_op_xpu subgraph_bridge_mul_op_xpu
subgraph_bridge_batch_norm_op_xpu subgraph_bridge_batch_norm_op_xpu
subgraph_bridge_transpose_op_xpu
subgraph_bridge_reshape_op_xpu
subgraph_bridge_layer_norm_op_xpu
CACHE INTERNAL "xpu_subgraph_bridges") CACHE INTERNAL "xpu_subgraph_bridges")
message(STATUS "+++++ xpu_subgraph_bridges: ${xpu_subgraph_bridges}") message(STATUS "+++++ xpu_subgraph_bridges: ${xpu_subgraph_bridges}")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {
int LayerNormConverter(void* ctx, OpLite* op) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[XPU] Converting " + op_type + "...";
// Get input vars and op attributes
auto x_var_name = op_info->Input("X").front();
auto scale_var_name = op_info->Input("Scale").front();
auto* scale = scope->FindMutableTensor(scale_var_name);
auto bias_var_name = op_info->Input("Bias").front();
auto* bias = scope->FindMutableTensor(bias_var_name);
auto y_var_name = op_info->Output("Y").front();
auto epsilon = op_info->GetAttr<float>("epsilon");
auto axis = op_info->GetAttr<int>("begin_norm_axis");
// Create scale, bias nodes
auto scale_const_node = graph->AddNode(scale_var_name, *scale);
auto bias_const_node = graph->AddNode(bias_var_name, *bias);
// Create node and set params from op
auto layer_norm_node =
graph->builder_.CreateLayerNorm(*graph->GetNode(x_var_name),
*scale_const_node,
*bias_const_node,
axis,
epsilon,
true,
true);
graph->AddNode(y_var_name, graph->builder_.GetField(layer_norm_node, 0));
return SUCCESS;
}
} // namespace xpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(XPU,
layer_norm,
paddle::lite::subgraph::xpu::LayerNormConverter);
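For reference, layer_norm flattens X to [outer, H] at begin_norm_axis and normalizes each row; with the epsilon, Scale (gamma) and Bias (beta) read above, the standard formulation is:

\mu = \frac{1}{H}\sum_{i=1}^{H} x_i, \qquad
\sigma^2 = \frac{1}{H}\sum_{i=1}^{H}(x_i - \mu)^2, \qquad
y_i = \gamma_i \, \frac{x_i - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta_i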
...@@ -22,3 +22,7 @@ USE_SUBGRAPH_BRIDGE(XPU, pool2d); ...@@ -22,3 +22,7 @@ USE_SUBGRAPH_BRIDGE(XPU, pool2d);
USE_SUBGRAPH_BRIDGE(XPU, softmax); USE_SUBGRAPH_BRIDGE(XPU, softmax);
USE_SUBGRAPH_BRIDGE(XPU, mul); USE_SUBGRAPH_BRIDGE(XPU, mul);
USE_SUBGRAPH_BRIDGE(XPU, batch_norm); USE_SUBGRAPH_BRIDGE(XPU, batch_norm);
USE_SUBGRAPH_BRIDGE(XPU, transpose);
USE_SUBGRAPH_BRIDGE(XPU, transpose2);
USE_SUBGRAPH_BRIDGE(XPU, reshape);
USE_SUBGRAPH_BRIDGE(XPU, reshape2);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/reshape_op.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {
int ReshapeConverter(void* ctx, OpLite* op) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto scope = op->scope();
auto op_type = op_info->Type();
VLOG(3) << "[XPU] Converting " + op_type + "...";
// Create node and set params from op
auto x_var_name = op_info->Input("X").front();
auto out_var_name = op_info->Output("Out").front();
std::vector<int> shape;
if (op_info->HasInput("ShapeTensor") &&
!op_info->Input("ShapeTensor").empty()) {
for (auto var_name : op_info->Input("ShapeTensor")) {
shape.emplace_back(scope->FindMutableTensor(var_name)->data<int>()[0]);
}
CHECK_GT(shape.size(), 0)
<< "ShapeError: When `shape` in ReshapeOp is a list or tuple "
"which contains Tensor, the shape's size can't be zero. "
"But received shape's size is "
<< shape.size();
} else if (op_info->HasInput("Shape") && !op_info->Input("Shape").empty()) {
auto shape_tensor =
scope->FindMutableTensor(op_info->Input("Shape").front());
auto shape_data = shape_tensor->data<int>();
shape = std::vector<int>(shape_data, shape_data + shape_tensor->numel());
} else if (op_info->HasAttr("shape")) {
shape = op_info->GetAttr<std::vector<int>>("shape");
} else {
LOG(FATAL) << "no new shape for reshape op";
}
auto out_dims =
operators::ValidateShape(shape, scope->FindTensor(x_var_name)->dims());
CHECK(graph->HasNode(x_var_name));
graph->AddNode(out_var_name,
graph->builder_.CreateReshape(*graph->GetNode(x_var_name),
Cvt2ArrayInt(out_dims)));
return SUCCESS;
}
} // namespace xpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(XPU,
reshape2,
paddle::lite::subgraph::xpu::ReshapeConverter);
REGISTER_SUBGRAPH_BRIDGE(XPU,
reshape,
paddle::lite::subgraph::xpu::ReshapeConverter);
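The converter resolves the target shape with operators::ValidateShape before emitting the XPU node. As an illustration of the usual Paddle reshape conventions (stated here as an assumption about ValidateShape, not verified against its implementation):

// Assumed conventions: -1 is inferred from the remaining element count,
//                       0 copies the corresponding input dimension.
// Example: x dims = {4, 5, 19, 19} (7220 elements), shape attr = {0, -1}
//   -> out dims = {4, 1805}   // dim 0 kept, 7220 / 4 = 1805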
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/xpu/bridges/graph.h"
#include "lite/kernels/xpu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace xpu {
int TransposeConverter(void* ctx, OpLite* op) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
VLOG(3) << "[XPU] Converting " + op_type + "...";
// Create node and set params from op
auto x_var_name = op_info->Input("X").front();
auto out_var_name = op_info->Output("Out").front();
auto axis = op_info->GetAttr<std::vector<int>>("axis");
CHECK(graph->HasNode(x_var_name));
graph->AddNode(
out_var_name,
graph->builder_.CreateTranspose(
*graph->GetNode(x_var_name),
Cvt2ArrayInt(std::vector<int64_t>(axis.begin(), axis.end()))));
return SUCCESS;
}
} // namespace xpu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(XPU,
transpose,
paddle::lite::subgraph::xpu::TransposeConverter);
REGISTER_SUBGRAPH_BRIDGE(XPU,
transpose2,
paddle::lite::subgraph::xpu::TransposeConverter);
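The axis attribute is a permutation of the input dimensions; a small example for orientation:

// x dims = {2, 3, 4}, axis = {0, 2, 1}  ->  out dims = {2, 4, 3}
// (out[i] takes its extent from x[axis[i]], as in Transpose2Op::InferShape.)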
...@@ -125,6 +125,18 @@ std::shared_ptr<xtcl::xNDArray> CvtTensor(const Tensor& in_tensor, ...@@ -125,6 +125,18 @@ std::shared_ptr<xtcl::xNDArray> CvtTensor(const Tensor& in_tensor,
return out_tensor; return out_tensor;
} }
xtcl::Array<xtcl::Integer> Cvt2ArrayInt(const std::vector<int64_t>& input) {
xtcl::Array<xtcl::Integer> output;
for (auto i : input) {
output.push_back(i);
}
return output;
}
xtcl::Array<xtcl::Integer> Cvt2ArrayInt(const DDim& input) {
return Cvt2ArrayInt(input.Vectorize());
}
} // namespace xpu } // namespace xpu
} // namespace subgraph } // namespace subgraph
} // namespace lite } // namespace lite
......
...@@ -47,6 +47,9 @@ std::shared_ptr<xtcl::xNDArray> CvtTensor( ...@@ -47,6 +47,9 @@ std::shared_ptr<xtcl::xNDArray> CvtTensor(
PrecisionType in_ptype = PRECISION(kFloat), PrecisionType in_ptype = PRECISION(kFloat),
DataLayoutType in_ltype = DATALAYOUT(kNCHW)); DataLayoutType in_ltype = DATALAYOUT(kNCHW));
xtcl::Array<xtcl::Integer> Cvt2ArrayInt(const std::vector<int64_t>& input);
xtcl::Array<xtcl::Integer> Cvt2ArrayInt(const DDim& input);
} // namespace xpu } // namespace xpu
} // namespace subgraph } // namespace subgraph
} // namespace lite } // namespace lite
......
...@@ -60,9 +60,14 @@ int SubgraphEngine::BuildDeviceProgram() { ...@@ -60,9 +60,14 @@ int SubgraphEngine::BuildDeviceProgram() {
// Obtain the output nodes of the XPU IR graph and build the graph to XPU // Obtain the output nodes of the XPU IR graph and build the graph to XPU
// runtime // runtime
std::vector<xtcl::xExpr*> output_nodes; std::vector<xtcl::xExpr*> output_nodes;
std::vector<std::string> valid_output_names;
for (auto& output_name : output_names_) { for (auto& output_name : output_names_) {
output_nodes.push_back(graph.GetNode(output_name).get()); if (graph.HasNode(output_name)) {
output_nodes.push_back(graph.GetNode(output_name).get());
valid_output_names.push_back(output_name);
}
} }
CHECK(!valid_output_names.empty()) << "[XPU] no valid output names";
device_program_ = lite::xpu::Device::Global().Build( device_program_ = lite::xpu::Device::Global().Build(
&graph.builder_, &graph.params_, &output_nodes); &graph.builder_, &graph.params_, &output_nodes);
if (device_program_ == nullptr) { if (device_program_ == nullptr) {
...@@ -73,16 +78,16 @@ int SubgraphEngine::BuildDeviceProgram() { ...@@ -73,16 +78,16 @@ int SubgraphEngine::BuildDeviceProgram() {
// Query and check the dimensions of input and output tensors // Query and check the dimensions of input and output tensors
origin_idims_.resize(input_names_.size()); origin_idims_.resize(input_names_.size());
origin_itensors_.resize(input_names_.size()); origin_itensors_.resize(input_names_.size());
origin_odims_.resize(output_names_.size()); origin_odims_.resize(valid_output_names.size());
origin_otensors_.resize(output_names_.size()); origin_otensors_.resize(valid_output_names.size());
for (int i = 0; i < input_names_.size(); i++) { for (int i = 0; i < input_names_.size(); i++) {
origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]); origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]);
CHECK(origin_itensors_[i]); CHECK(origin_itensors_[i]);
origin_idims_[i] = origin_itensors_[i]->dims(); origin_idims_[i] = origin_itensors_[i]->dims();
VLOG(3) << "[XPU] Input dims[" << i << "]: " << origin_idims_[i]; VLOG(3) << "[XPU] Input dims[" << i << "]: " << origin_idims_[i];
} }
for (int i = 0; i < output_names_.size(); i++) { for (int i = 0; i < valid_output_names.size(); i++) {
origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]); origin_otensors_[i] = scope_->FindMutableTensor(valid_output_names[i]);
CHECK(origin_otensors_[i]); CHECK(origin_otensors_[i]);
origin_odims_[i] = origin_otensors_[i]->dims(); origin_odims_[i] = origin_otensors_[i]->dims();
VLOG(3) << "[XPU] Output dims[" << i << "]: " << origin_odims_[i]; VLOG(3) << "[XPU] Output dims[" << i << "]: " << origin_odims_[i];
...@@ -113,7 +118,7 @@ int SubgraphEngine::LaunchDeviceProgram() { ...@@ -113,7 +118,7 @@ int SubgraphEngine::LaunchDeviceProgram() {
device_program_->Run(); device_program_->Run();
VLOG(3) << "[XPU] Process cost " << GetCurrentUS() - start_time << " us"; VLOG(3) << "[XPU] Process cost " << GetCurrentUS() - start_time << " us";
// Copy the data of output XPU tensor to the buffer of origin output tensors // Copy the data of output XPU tensor to the buffer of origin output tensors
for (size_t i = 0; i < output_names_.size(); i++) { for (size_t i = 0; i < origin_otensors_.size(); i++) {
auto output_ndarray = device_program_->GetOutput(i); auto output_ndarray = device_program_->GetOutput(i);
std::memcpy(origin_otensors_[i]->mutable_data<float>(), std::memcpy(origin_otensors_[i]->mutable_data<float>(),
static_cast<float*>(output_ndarray.ToDLPack()->dl_tensor.data), static_cast<float*>(output_ndarray.ToDLPack()->dl_tensor.data),
......
...@@ -49,6 +49,7 @@ add_operator(dropout_op basic SRCS dropout_op.cc DEPS ${op_DEPS}) ...@@ -49,6 +49,7 @@ add_operator(dropout_op basic SRCS dropout_op.cc DEPS ${op_DEPS})
add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS}) add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS})
add_operator(instance_norm_op basic SRCS instance_norm_op.cc DEPS ${op_DEPS}) add_operator(instance_norm_op basic SRCS instance_norm_op.cc DEPS ${op_DEPS})
add_operator(subgraph_op basic SRCS subgraph_op.cc DEPS ${op_DEPS}) add_operator(subgraph_op basic SRCS subgraph_op.cc DEPS ${op_DEPS})
add_operator(grid_sampler_op basic SRCS grid_sampler_op.cc DEPS ${op_DEPS})
# 2.basic ops not used in basic models # 2.basic ops not used in basic models
add_operator(negative_op extra SRCS negative_op.cc DEPS ${op_DEPS}) add_operator(negative_op extra SRCS negative_op.cc DEPS ${op_DEPS})
...@@ -89,6 +90,8 @@ add_operator(merge_lod_tensor_op_lite extra SRCS merge_lod_tensor_op.cc DEPS ${o ...@@ -89,6 +90,8 @@ add_operator(merge_lod_tensor_op_lite extra SRCS merge_lod_tensor_op.cc DEPS ${o
add_operator(reduce_prod_op_lite extra SRCS reduce_prod_op.cc DEPS ${op_DEPS}) add_operator(reduce_prod_op_lite extra SRCS reduce_prod_op.cc DEPS ${op_DEPS})
add_operator(sequence_reshape_op_lite extra SRCS sequence_reshape_op.cc DEPS ${op_DEPS}) add_operator(sequence_reshape_op_lite extra SRCS sequence_reshape_op.cc DEPS ${op_DEPS})
add_operator(sequence_reverse_op_lite extra SRCS sequence_reverse_op.cc DEPS ${op_DEPS}) add_operator(sequence_reverse_op_lite extra SRCS sequence_reverse_op.cc DEPS ${op_DEPS})
add_operator(sequence_pool extra SRCS sequence_pool_op.cc DEPS ${op_DEPS})
add_operator(sequence_pool_concat extra SRCS sequence_pool_concat_op.cc DEPS ${op_DEPS})
add_operator(reduce_sum_op_lite extra SRCS reduce_ops.cc DEPS ${op_DEPS}) add_operator(reduce_sum_op_lite extra SRCS reduce_ops.cc DEPS ${op_DEPS})
add_operator(match_matrix_tensor_op_lite extra SRCS match_matrix_tensor_op.cc DEPS ${op_DEPS}) add_operator(match_matrix_tensor_op_lite extra SRCS match_matrix_tensor_op.cc DEPS ${op_DEPS})
add_operator(search_seq_depadding_op_lite extra SRCS search_seq_depadding_op.cc DEPS ${op_DEPS}) add_operator(search_seq_depadding_op_lite extra SRCS search_seq_depadding_op.cc DEPS ${op_DEPS})
...@@ -119,7 +122,6 @@ add_operator(greater_than extra SRCS compare_op.cc DEPS ${op_DEPS}) ...@@ -119,7 +122,6 @@ add_operator(greater_than extra SRCS compare_op.cc DEPS ${op_DEPS})
add_operator(greater_equal extra SRCS compare_op.cc DEPS ${op_DEPS}) add_operator(greater_equal extra SRCS compare_op.cc DEPS ${op_DEPS})
add_operator(read_from_array_op extra SRCS read_from_array_op.cc DEPS ${op_DEPS}) add_operator(read_from_array_op extra SRCS read_from_array_op.cc DEPS ${op_DEPS})
add_operator(beam_search_op extra SRCS beam_search_op.cc DEPS ${op_DEPS}) add_operator(beam_search_op extra SRCS beam_search_op.cc DEPS ${op_DEPS})
add_operator(sequence_pool extra SRCS sequence_pool_op.cc DEPS ${op_DEPS})
add_operator(lod_reset_op extra SRCS lod_reset_op.cc DEPS ${op_DEPS}) add_operator(lod_reset_op extra SRCS lod_reset_op.cc DEPS ${op_DEPS})
add_operator(is_empty extra SRCS is_empty_op.cc DEPS ${op_DEPS}) add_operator(is_empty extra SRCS is_empty_op.cc DEPS ${op_DEPS})
add_operator(slice_op_lite basic SRCS slice_op.cc DEPS ${op_DEPS}) add_operator(slice_op_lite basic SRCS slice_op.cc DEPS ${op_DEPS})
......
...@@ -52,12 +52,12 @@ inline int ConvOutputSize(int input_size, ...@@ -52,12 +52,12 @@ inline int ConvOutputSize(int input_size,
return output_size; return output_size;
} }
inline void UpdatePaddingAndDilation(std::vector<int>* paddings, void UpdatePaddingAndDilation(std::vector<int>* paddings,
std::vector<int>* dilations, std::vector<int>* dilations,
const std::vector<int>& strides, const std::vector<int>& strides,
const std::string padding_algorithm, const std::string padding_algorithm,
const lite::DDim data_dims, const lite::DDim data_dims,
const lite::DDim& ksize) { const lite::DDim& ksize) {
// when padding_desc is "VALID" or "SAME" // when padding_desc is "VALID" or "SAME"
if (padding_algorithm == "SAME") { if (padding_algorithm == "SAME") {
for (size_t i = 0; i < strides.size(); ++i) { for (size_t i = 0; i < strides.size(); ++i) {
......
...@@ -136,7 +136,13 @@ class ConvOpLite : public OpLite { ...@@ -136,7 +136,13 @@ class ConvOpLite : public OpLite {
mutable ConvParam param_; mutable ConvParam param_;
std::string padding_algorithm_{""}; std::string padding_algorithm_{""};
}; };
// update padding dilation
void UpdatePaddingAndDilation(std::vector<int>* paddings,
std::vector<int>* dilations,
const std::vector<int>& strides,
const std::string padding_algorithm,
const lite::DDim data_dims,
const lite::DDim& ksize);
} // namespace operators } // namespace operators
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/grid_sampler_op.h"
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace operators {
bool GridSamplerOp::CheckShape() const {
CHECK_OR_FALSE(param_.x);
CHECK_OR_FALSE(param_.out);
CHECK_OR_FALSE(param_.grid);
auto x_dims = param_.x->dims();
auto grid_dims = param_.grid->dims();
CHECK_EQ(x_dims.size(), 4UL) << "Input must have 4 dimensions.";
CHECK_EQ(grid_dims.size(), 4UL) << "Grid must have 4 dimensions.";
CHECK_EQ(grid_dims[0], x_dims[0])
<< "Input(X) dims[0] and Input(Grid) dims[0] should be equal.";
CHECK_EQ(grid_dims[1], x_dims[2])
<< "Input(X) dims[2] and Input(Grid) dims[1] should be equal.";
CHECK_EQ(grid_dims[2], x_dims[3])
<< "Input(X) dims[3] and Input(Grid) dims[2] should be equal.";
return true;
}
bool GridSamplerOp::InferShape() const {
auto x_dims = param_.x->dims();
param_.out->Resize(x_dims);
return true;
}
bool GridSamplerOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) {
param_.x = scope->FindVar(op_desc.Input("X").front())->GetMutable<Tensor>();
param_.grid =
scope->FindVar(op_desc.Input("Grid").front())->GetMutable<Tensor>();
param_.out =
scope->FindVar(op_desc.Output("Output").front())->GetMutable<Tensor>();
return true;
}
} /* namespace operators */
} /* namespace lite */
} /* namespace paddle */
REGISTER_LITE_OP(grid_sampler, paddle::lite::operators::GridSamplerOp);
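An example of shapes accepted by CheckShape above; the last Grid dimension holding the (x, y) sampling coordinates is the usual grid_sample layout and is not checked explicitly here:

// X    : {4, 3, 19, 19}   // N, C, H, W
// Grid : {4, 19, 19, 2}   // dims 0/1/2 must equal N, H, W of X
// Out  : {4, 3, 19, 19}   // InferShape copies X's dims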
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
class GridSamplerOp : public OpLite {
public:
GridSamplerOp() {}
explicit GridSamplerOp(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShape() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "grid_sampler"; }
private:
mutable GridSamplerParam param_;
};
} /* namespace operators */
} /* namespace lite */
} /* namespace paddle */
...@@ -30,7 +30,7 @@ bool LayerNormOp::CheckShape() const { ...@@ -30,7 +30,7 @@ bool LayerNormOp::CheckShape() const {
bool LayerNormOp::InferShape() const { bool LayerNormOp::InferShape() const {
auto out_dims = param_.X->dims(); auto out_dims = param_.X->dims();
param_.Y->Resize(out_dims); param_.Y->Resize(out_dims);
auto inner_size = out_dims.Flatten2D(param_.begin_norm_axis)[1]; auto inner_size = out_dims.Flatten2D(param_.begin_norm_axis)[0];
param_.Mean->Resize(std::vector<int64_t>({inner_size})); param_.Mean->Resize(std::vector<int64_t>({inner_size}));
param_.Variance->Resize(std::vector<int64_t>({inner_size})); param_.Variance->Resize(std::vector<int64_t>({inner_size}));
......
...@@ -286,6 +286,8 @@ struct ConvParam { ...@@ -286,6 +286,8 @@ struct ConvParam {
std::string data_format{"Anylayout"}; std::string data_format{"Anylayout"};
// for activation // for activation
ActivationParam activation_param; ActivationParam activation_param;
// support var_length or not
bool var_length{false};
// for int8 // for int8
WITH_INT8_CONFIG WITH_INT8_CONFIG
}; };
...@@ -767,6 +769,12 @@ struct SequencePoolParam { ...@@ -767,6 +769,12 @@ struct SequencePoolParam {
#endif #endif
}; };
struct SequencePoolConcatParam {
std::vector<lite::Tensor*> X{};
lite::Tensor* Out{};
std::vector<std::string> pool_type{};
};
struct SearchGroupPaddingParam { struct SearchGroupPaddingParam {
lite::Tensor* x{}; lite::Tensor* x{};
lite::Tensor* out_emb_padding{}; lite::Tensor* out_emb_padding{};
...@@ -862,6 +870,8 @@ struct VarConv2DParam { ...@@ -862,6 +870,8 @@ struct VarConv2DParam {
int stride_w; int stride_w;
int kernel_h; int kernel_h;
int kernel_w; int kernel_w;
bool fuse_relu{false};
}; };
/// ----------------------- shape operators ---------------------- /// ----------------------- shape operators ----------------------
...@@ -1114,6 +1124,12 @@ struct InstanceNormParam { ...@@ -1114,6 +1124,12 @@ struct InstanceNormParam {
lite::Tensor* saved_variance{}; lite::Tensor* saved_variance{};
float epsilon; float epsilon;
}; };
/// --------------------- grid sampler operators --------------------
struct GridSamplerParam {
lite::Tensor* x{};
lite::Tensor* out{};
lite::Tensor* grid{};
};
} // namespace operators } // namespace operators
} // namespace lite } // namespace lite
......
...@@ -23,47 +23,10 @@ bool SequenceConcatOp::CheckShape() const { ...@@ -23,47 +23,10 @@ bool SequenceConcatOp::CheckShape() const {
CHECK_GT(param_.X.size(), 1) CHECK_GT(param_.X.size(), 1)
<< "The number of input sequences is at least two."; << "The number of input sequences is at least two.";
CHECK_OR_FALSE(param_.Out); CHECK_OR_FALSE(param_.Out);
size_t lod_size = 0;
for (const auto &t : param_.X) {
CHECK_EQ(t->lod().empty(), false)
<< "Input Tensor of X does not contain LoD information.";
// CHECK_EQ(t->lod().size(), 1) << "Only support one level sequence now.";
if (lod_size == 0) {
lod_size = t->lod()[0].size();
} else {
CHECK_EQ(t->lod()[0].size(), lod_size)
<< "The number of sequence must be same between each input";
}
}
CHECK_NE(lod_size, 0) << "Each input must have sequence information";
return true; return true;
} }
bool SequenceConcatOp::InferShape() const { bool SequenceConcatOp::InferShape() const { return true; }
int64_t batch_size = 0;
int64_t feature_size = 0;
std::vector<int64_t> out_dims;
for (const auto &tensor : param_.X) {
const auto x_dims = tensor->dims();
if (out_dims.empty()) {
out_dims = x_dims.Vectorize();
}
batch_size += x_dims[0];
if (feature_size == 0) {
feature_size = x_dims.production() / x_dims[0];
} else {
CHECK_EQ(feature_size, x_dims.production() / x_dims[0])
<< "Inputs of sequence concat must have same feature size";
}
}
if (batch_size < 0) {
batch_size = -1; // Normalize batch size for compile time.
}
out_dims[0] = batch_size;
param_.Out->Resize(out_dims);
// LoD info will be computed in Kernel.
return true;
}
bool SequenceConcatOp::AttachImpl(const cpp::OpDesc &opdesc, bool SequenceConcatOp::AttachImpl(const cpp::OpDesc &opdesc,
lite::Scope *scope) { lite::Scope *scope) {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/sequence_pool_concat_op.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace operators {
bool SequencePoolConcatOp::CheckShape() const {
  CHECK_GE(param_.X.size(), 1)
      << "The number of input sequences is at least one.";
CHECK_OR_FALSE(param_.Out);
return true;
}
bool SequencePoolConcatOp::InferShape() const {
int out_dim = 0;
for (int i = 0; i < param_.X.size(); ++i) {
out_dim += param_.X[i]->dims().count(1, param_.X[i]->dims().size());
}
int seq_num = param_.X[0]->lod()[0].size() - 1;
std::vector<std::vector<uint64_t>> lod(1);
for (int i = 0; i < seq_num + 1; ++i) {
lod[0].push_back(i);
}
param_.Out->set_lod(lod);
param_.Out->Resize({seq_num, out_dim});
return true;
}
bool SequencePoolConcatOp::AttachImpl(const cpp::OpDesc &opdesc,
lite::Scope *scope) {
auto input_list = opdesc.Input("X");
param_.X.clear();
for (auto var : input_list) {
param_.X.push_back(scope->FindVar(var)->GetMutable<lite::Tensor>());
}
param_.Out =
scope->FindVar(opdesc.Output("Out").front())->GetMutable<lite::Tensor>();
  CHECK(param_.Out)
      << "Output(Out) of SequencePoolConcat Op should not be null.";
param_.pool_type = opdesc.GetAttr<std::vector<std::string>>("pooltype");
return true;
}
} // namespace operators
} // namespace lite
} // namespace paddle
REGISTER_LITE_OP(sequence_pool_concat,
paddle::lite::operators::SequencePoolConcatOp);
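InferShape above emits one pooled row per sequence, concatenating the per-input feature widths. A worked example with shapes and LoD chosen for illustration:

// X[0] dims = {n, 8}, X[1] dims = {n, 16}, both with lod {0, 3, 7, n}
//   seq_num = 4 - 1 = 3,  out_dim = 8 + 16 = 24
//   Out dims = {3, 24},  Out lod = {0, 1, 2, 3}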
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/utils/all.h"
namespace paddle {
namespace lite {
namespace operators {
class SequencePoolConcatOp : public OpLite {
public:
SequencePoolConcatOp() {}
explicit SequencePoolConcatOp(const std::string &op_type) : OpLite(op_type) {}
bool CheckShape() const override;
bool InferShape() const override;
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "sequence_pool_concat"; }
private:
mutable SequencePoolConcatParam param_;
};
} // namespace operators
} // namespace lite
} // namespace paddle
...@@ -135,6 +135,15 @@ bool Transpose2Op::InferShape() const { ...@@ -135,6 +135,15 @@ bool Transpose2Op::InferShape() const {
out_dims[i] = x_dims[axis[i]]; out_dims[i] = x_dims[axis[i]];
} }
param_.output->Resize(out_dims); param_.output->Resize(out_dims);
std::vector<DDim::value_type> xshape_dims(x_dims.size() + 1, 0);
for (size_t i = 0; i < x_dims.size(); i++) {
xshape_dims[i + 1] = x_dims[i];
}
param_.xshape->Resize(xshape_dims);
auto xshape_lod = param_.xshape->mutable_lod();
*xshape_lod = param_.x->lod();
return true; return true;
} }
......
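The added lines also populate XShape with the original input shape prefixed by a 0 (the convention consumed by transpose2's companion ops, stated here as background rather than taken from this diff):

// Example: x dims = {3, 4, 5}  ->  xshape dims = {0, 3, 4, 5}
// (the leading 0 is a placeholder; xshape also inherits x's LoD, as set above)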
...@@ -19,28 +19,7 @@ namespace paddle { ...@@ -19,28 +19,7 @@ namespace paddle {
namespace lite { namespace lite {
namespace operators { namespace operators {
bool VarConv2dOp::CheckShape() const { bool VarConv2dOp::CheckShape() const { return true; }
auto x_dims = param_.X->dims();
CHECK_EQ(x_dims.size(), 2) << "The rank of X(Input) can't be less than 2.";
auto w_dims = param_.W->dims();
CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor";
CHECK_EQ(w_dims[0], param_.output_channel)
<< "W dim[0] should be equal to OutputChannel";
CHECK_EQ(w_dims[1], param_.input_channel * param_.kernel_h * param_.kernel_w)
<< "W dim[1] should be equal to InputChannel * KernelH * KernelW";
LoD x_lod = param_.X->lod();
CHECK_EQ(x_lod.empty(), false) << "The Input(X) must hold lod info.";
// CHECK_GE(x_lod.size(), 1) << "The Input(X)'s lod info is corrupted.";
CHECK_GE(x_lod.size(), 3) << "The Input(X)'s lod info is corrupted.";
CHECK_EQ(x_dims[0], static_cast<int64_t>(x_lod[0].back()))
<< "The Input(X)'s lod info mismatches the actual tensor shape.";
// LoD row_lod = param_.ROW->lod();
// CHECK_EQ(row_lod.empty(), false) << "The Input(ROW) must hold lod info.";
// LoD col_lod = param_.COLUMN->lod();
// CHECK_EQ(col_lod.empty(), false) << "The Input(COLUMN) must hold lod
// info.";
return true;
}
bool VarConv2dOp::InferShape() const { return true; } bool VarConv2dOp::InferShape() const { return true; }
...@@ -69,6 +48,10 @@ bool VarConv2dOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { ...@@ -69,6 +48,10 @@ bool VarConv2dOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
param_.kernel_w = opdesc.GetAttr<int>("KernelW"); param_.kernel_w = opdesc.GetAttr<int>("KernelW");
param_.stride_h = opdesc.GetAttr<int>("StrideH"); param_.stride_h = opdesc.GetAttr<int>("StrideH");
param_.stride_w = opdesc.GetAttr<int>("StrideW"); param_.stride_w = opdesc.GetAttr<int>("StrideW");
if (opdesc.HasAttr("fuse_relu")) {
param_.fuse_relu = opdesc.GetAttr<bool>("fuse_relu");
}
return true; return true;
} }
......
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_XPU) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
...@@ -15,6 +15,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_XPU) AND (LITE ...@@ -15,6 +15,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_XPU) AND (LITE
lite_cc_test(test_kernel_norm_compute SRCS norm_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_norm_compute SRCS norm_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_cast_compute SRCS cast_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_cast_compute SRCS cast_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_instance_norm_compute SRCS instance_norm_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_instance_norm_compute SRCS instance_norm_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_grid_sampler_compute SRCS grid_sampler_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
#lite_cc_test(test_kernel_sequence_softmax_compute SRCS sequence_softmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_sequence_softmax_compute SRCS sequence_softmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
#lite_cc_test(test_kernel_im2sequence_compute SRCS im2sequence_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_im2sequence_compute SRCS im2sequence_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
#lite_cc_test(test_kernel_compare_compute SRCS compare_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_compare_compute SRCS compare_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
...@@ -24,6 +25,9 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_XPU) AND (LITE ...@@ -24,6 +25,9 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_XPU) AND (LITE
#lite_cc_test(test_kernel_write_to_array_compute SRCS write_to_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_write_to_array_compute SRCS write_to_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
#lite_cc_test(test_kernel_read_from_array_compute SRCS read_from_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_read_from_array_compute SRCS read_from_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
if(LITE_BUILD_EXTRA) if(LITE_BUILD_EXTRA)
lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/arena/framework.h"
#include "lite/tests/utils/fill_data.h"
namespace paddle {
namespace lite {
class GridSamplerComputeTest : public arena::TestCase {
protected:
// common attributes for this op.
std::string input_ = "x";
std::string output_ = "y";
std::string grid_ = "grid";
DDim dims_{{4, 5, 19, 19}};
public:
GridSamplerComputeTest(const Place& place,
const std::string& alias,
DDim dims)
: TestCase(place, alias), dims_(dims) {}
void RunBaseline(Scope* scope) override {
auto x = scope->FindTensor(input_);
auto grid = scope->FindTensor(grid_);
auto out = scope->NewTensor(output_);
CHECK(out);
out->Resize(dims_);
const float* x_data = x->data<float>();
const float* grid_data = grid->data<float>();
float* out_data = out->mutable_data<float>();
int num = x->dims()[0];
int channel = x->dims()[1];
int height = x->dims()[2];
int width = x->dims()[3];
int spatial_size = height * width;
auto inbound = [](int x, int y, float x_max, float y_max) {
if (x < 0 || x > x_max || y < 0 || y > y_max) {
return false;
}
return true;
};
for (int n = 0; n < num; ++n) {
const float* x_n = x_data + n * channel * height * width;
float* out_n = out_data + n * channel * height * width;
const float* grid_n = grid_data + n * height * width * 2;
for (int c = 0; c < channel; ++c) {
const float* x_c = x_n + c * spatial_size;
float* out_c = out_n + c * spatial_size;
for (int s = 0; s < spatial_size; ++s) {
float x = grid_n[s * 2];
float y = grid_n[s * 2 + 1];
float xwf = (x + 1.f) * 0.5 * (width - 1);
float ynf = (y + 1.f) * 0.5 * (height - 1);
int xw = floor(xwf);
int xe = xw + 1;
int yn = floor(ynf);
int ys = yn + 1;
float dw = xwf - xw;
float de = xe - xwf;
float dn = ynf - yn;
float ds = ys - ynf;
float wn = inbound(xw,
yn,
static_cast<float>(width - 1),
static_cast<float>(height - 1))
? x_c[yn * width + xw]
: 0.f;
float en = inbound(xe,
yn,
static_cast<float>(width - 1),
static_cast<float>(height - 1))
? x_c[yn * width + xe]
: 0.f;
float ws = inbound(xw,
ys,
static_cast<float>(width - 1),
static_cast<float>(height - 1))
? x_c[ys * width + xw]
: 0.f;
float es = inbound(xe,
ys,
static_cast<float>(width - 1),
static_cast<float>(height - 1))
? x_c[ys * width + xe]
: 0.f;
out_c[s] = wn * de * ds + en * dw * ds + ws * de * dn + es * dw * dn;
}
}
}
}
void PrepareOpDesc(cpp::OpDesc* op_desc) {
op_desc->SetType("grid_sampler");
op_desc->SetInput("X", {input_});
op_desc->SetInput("Grid", {grid_});
op_desc->SetOutput("Output", {output_});
}
void PrepareData() override {
std::vector<float> din(dims_.production());
fill_data_rand(din.data(), -1.f, 1.f, dims_.production());
DDim grid_dims{{dims_[0], dims_[2], dims_[3], 2}};
std::vector<float> grid(grid_dims.production());
fill_data_rand(grid.data(), -1.f, 1.f, grid_dims.production());
SetCommonTensor(input_, dims_, din.data());
SetCommonTensor(grid_, grid_dims, grid.data());
}
};
void test_grid_sampler(Place place) {
for (auto& n : {1, 13}) {
for (auto& c : {1, 3, 8}) {
for (auto& h : {1, 3, 8, 64}) {
for (auto& w : {2, 4, 9, 63}) {
DDim dim_in({n, c, h, w});
std::unique_ptr<arena::TestCase> tester(
new GridSamplerComputeTest(place, "def", dim_in));
#ifdef LITE_WITH_ARM
auto& ctx = tester->context()->As<ARMContext>();
ctx.SetRunMode(lite_api::LITE_POWER_HIGH, 1);
#endif
arena::Arena arena(std::move(tester), place, 6e-5);
LOG(INFO) << "run n: " << n << ", c: " << c << ", h: " << h
<< ", w: " << w;
if (!arena.TestPrecision()) {
LOG(ERROR) << "No Pass!!";
return;
}
// If you want to test the performance of this op, uncomment the following line:
// arena.TestPerformance();
}
}
}
}
}
TEST(GridSampler, precision) {
#ifdef LITE_WITH_ARM
Place place(TARGET(kARM));
test_grid_sampler(place);
#endif
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/arena/framework.h"
#include "lite/tests/utils/fill_data.h"
namespace paddle {
namespace lite {
class LayerNormComputeTest : public arena::TestCase {
protected:
// common attributes for this op.
std::string op_type_ = "layer_norm";
std::string input_ = "x";
std::string scale_ = "scale";
std::string bias_ = "bias";
std::string output_ = "y";
std::string mean_ = "mean";
std::string variance_ = "variance";
DDim dims_{{4, 5, 19, 19}};
float epsilon_ = 1e-5f;
int begin_norm_axis_ = 1;
bool has_bias_ = true;
bool has_scale_ = true;
public:
LayerNormComputeTest(const Place& place,
const std::string& alias,
DDim dims,
float epsilon,
int begin_norm_axis,
bool has_bias,
bool has_scale)
: TestCase(place, alias),
dims_(dims),
epsilon_(epsilon),
begin_norm_axis_(begin_norm_axis),
has_bias_(has_bias),
has_scale_(has_scale) {}
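// Reference implementation: the input is flattened to a
// [batch_size, feature_size] matrix at begin_norm_axis; for each row the
// mean and variance are computed and the row is normalized as
//   y = (x - mean) / sqrt(variance + epsilon) * scale + bias,
// where scale and bias are optional and broadcast over the normalized axes.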
void RunBaseline(Scope* scope) override {
auto x = scope->FindTensor(input_);
auto scale = scope->FindTensor(scale_);
auto bias = scope->FindTensor(bias_);
auto y = scope->NewTensor(output_);
auto mean = scope->NewTensor(mean_);
auto variance = scope->NewTensor(variance_);
CHECK(y);
CHECK(mean);
CHECK(variance);
y->Resize(dims_);
auto matrix_dim = dims_.Flatten2D(begin_norm_axis_);
int batch_size = matrix_dim[0];
int feature_size = matrix_dim[1];
mean->Resize(std::vector<int64_t>{batch_size});
variance->Resize(std::vector<int64_t>{batch_size});
auto* x_data = x->data<float>();
auto* scale_data = (scale == nullptr ? nullptr : scale->data<float>());
auto* bias_data = (bias == nullptr ? nullptr : bias->data<float>());
auto* out_data = y->mutable_data<float>();
auto* mean_data = mean->mutable_data<float>();
auto* variance_data = variance->mutable_data<float>();
for (int i = 0; i < batch_size; ++i) {
int start = i * feature_size;
int end = start + feature_size;
float mean_t = 0;
float variance_t = 0;
for (int j = start; j < end; ++j) {
mean_t += x_data[j];
variance_t += x_data[j] * x_data[j];
}
mean_t /= feature_size;
variance_t = variance_t / feature_size - mean_t * mean_t;
mean_data[i] = mean_t;
variance_data[i] = variance_t;
variance_t = sqrt(variance_t + epsilon_);
for (int j = start; j < end; ++j) {
out_data[j] = (x_data[j] - mean_t) / variance_t;
if (scale_data) {
out_data[j] *= scale_data[j - start];
}
if (bias_data) {
out_data[j] += bias_data[j - start];
}
}
}
}
void PrepareOpDesc(cpp::OpDesc* op_desc) {
op_desc->SetType(op_type_);
op_desc->SetInput("X", {input_});
op_desc->SetInput("Bias", {bias_});
op_desc->SetInput("Scale", {scale_});
op_desc->SetOutput("Y", {output_});
op_desc->SetOutput("Mean", {mean_});
op_desc->SetOutput("Variance", {variance_});
op_desc->SetAttr("epsilon", epsilon_);
op_desc->SetAttr("begin_norm_axis", begin_norm_axis_);
}
void PrepareData() override {
std::vector<float> din(dims_.production());
fill_data_rand(din.data(), -1.f, 1.f, dims_.production());
std::vector<int64_t> scale_v;
for (size_t i = begin_norm_axis_; i < dims_.size(); i++) {
scale_v.push_back(dims_[i]);
}
DDim scale_dim(scale_v);
std::vector<float> scale(scale_dim.production());
fill_data_rand(scale.data(), -1.f, 1.f, scale_dim.production());
std::vector<float> bias(scale_dim.production());
fill_data_rand(bias.data(), -1.f, 1.f, scale_dim.production());
SetCommonTensor(input_, dims_, din.data());
SetCommonTensor(scale_, scale_dim, scale.data());
SetCommonTensor(bias_, scale_dim, bias.data());
}
};
TEST(LayerNorm, precision) {
LOG(INFO) << "test layer_norm op";
float abs_error = 2e-5;
Place place;
#if defined(LITE_WITH_XPU)
place = TARGET(kXPU);
#elif defined(LITE_WITH_ARM)
place = TARGET(kARM);
abs_error = 6e-5;
#else
return;
#endif
std::vector<std::vector<int64_t>> dims{{1, 2, 3, 4}, {2, 3, 4}, {3, 4}};
for (auto dim_in : dims) {
for (auto epsilon : {1e-5f}) {
for (auto axis : {0, 1, 2, 3}) {
for (bool has_bias : {true, false}) {
for (bool has_scale : {true, false}) {
if (axis >= dim_in.size()) continue;
std::unique_ptr<arena::TestCase> tester(
new LayerNormComputeTest(place,
"def",
DDim(dim_in),
epsilon,
axis,
has_bias,
has_scale));
#ifdef LITE_WITH_ARM
auto& ctx = tester->context()->As<ARMContext>();
ctx.SetRunMode(lite_api::LITE_POWER_HIGH, 4);
#endif
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision({"mean", "variance"});
}
}
}
}
}
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/arena/framework.h"
namespace paddle {
namespace lite {
class ReshapeComputeTester : public arena::TestCase {
protected:
// common attributes for this op.
std::string op_type_ = "reshape2";
std::string input_ = "x";
std::string output_ = "out";
std::string xshape_ = "xshape";
std::vector<std::string> shape_tensor_vct_;
std::string shape_tensor_;
DDim x_dims_;
std::vector<int> shape_;
bool inplace_ = false;
public:
ReshapeComputeTester(const Place& place,
const std::string& alias,
DDim x_dims,
std::vector<int> shape,
bool is_shape_tensor_vct = false,
bool is_shape_tensor = false,
bool is_shape = true)
: TestCase(place, alias), x_dims_(x_dims) {
if (is_shape_tensor_vct) {
for (size_t i = 0; i < shape.size(); i++) {
shape_tensor_vct_.emplace_back(op_type_ + "/shape" + std::to_string(i));
}
} else if (is_shape_tensor) {
shape_tensor_ = op_type_ + "/shape";
} else if (is_shape) {
shape_ = shape;
} else {
LOG(FATAL) << "must set new shape!";
}
}
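// Reference implementation: the target shape may come from the "shape"
// attribute, a single Shape tensor, or a list of ShapeTensor inputs. Within
// the target shape, 0 keeps the corresponding input dimension and a single
// -1 is inferred so that the total number of elements stays unchanged.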
void RunBaseline(Scope* scope) override {
auto* out = scope->NewTensor(output_);
CHECK(out);
auto* x = scope->FindTensor(input_);
auto x_dims = x->dims();
std::vector<int> out_shape;
if (shape_tensor_vct_.size() > 0) {
for (auto shape_tensor : shape_tensor_vct_) {
out_shape.push_back(scope->FindTensor(shape_tensor)->data<int>()[0]);
}
} else if (!shape_tensor_.empty()) {
auto shape_tensor = scope->FindTensor(shape_tensor_);
auto shape_tensor_data = shape_tensor->data<int>();
out_shape = std::vector<int>(shape_tensor_data,
shape_tensor_data + shape_tensor->numel());
} else if (!shape_.empty()) {
out_shape = shape_;
} else {
LOG(FATAL) << "must set new shape!";
}
std::vector<int64_t> final_out_shape(out_shape.size(), 1);
int unk_dim_idx = -1;
int cap = 1;
for (size_t i = 0; i < out_shape.size(); i++) {
if (out_shape[i] == -1) {
CHECK_EQ(unk_dim_idx, -1);
unk_dim_idx = i;
} else if (out_shape[i] == 0) {
CHECK_LT(i, x_dims.size());
final_out_shape[i] = x_dims[i];
} else if (out_shape[i] > 0) {
final_out_shape[i] = out_shape[i];
} else {
LOG(FATAL) << "invalid shape";
}
cap *= final_out_shape[i];
}
if (unk_dim_idx > -1) {
final_out_shape[unk_dim_idx] = x_dims.production() / cap;
}
out->Resize(final_out_shape);
auto x_data = x->data<float>();
auto out_data = out->mutable_data<float>();
memcpy(out_data, x_data, sizeof(float) * x_dims.production());
if (op_type_ == "reshape2") {
auto* xshape = scope->NewTensor(xshape_);
auto xshape_dims = x_dims.Vectorize();
xshape_dims.insert(xshape_dims.begin(), 0);
xshape->Resize(xshape_dims);
}
}
void PrepareOpDesc(cpp::OpDesc* op_desc) {
op_desc->SetType(op_type_);
op_desc->SetInput("X", {input_});
if (shape_tensor_vct_.size() > 0) {
op_desc->SetInput("ShapeTensor", shape_tensor_vct_);
} else if (!shape_tensor_.empty()) {
op_desc->SetInput("Shape", {shape_tensor_});
} else if (shape_.size() > 0) {
op_desc->SetAttr("shape", shape_);
} else {
LOG(FATAL) << "invalid shape";
}
op_desc->SetOutput("Out", {output_});
if (op_type_ == "reshape2") {
op_desc->SetOutput("XShape", {xshape_});
}
op_desc->SetAttr("inplace", inplace_);
}
void PrepareData() override {
std::vector<float> data(x_dims_.production());
for (int i = 0; i < x_dims_.production(); i++) {
data[i] = i * 1.1;
}
SetCommonTensor(input_, x_dims_, data.data());
if (shape_tensor_vct_.size() > 0) {
for (size_t i = 0; i < shape_.size(); i++) {
std::vector<int> shape_data{shape_[i]};
SetCommonTensor(shape_tensor_vct_[i],
DDim(std::vector<int64_t>{1}),
shape_data.data());
}
}
if (!shape_tensor_.empty()) {
SetCommonTensor(
shape_tensor_,
DDim(std::vector<int64_t>{static_cast<int64_t>(shape_.size())}),
shape_.data());
}
}
};
TEST(Reshape, precision) {
LOG(INFO) << "test Reshape op";
float abs_error = 2e-5;
Place place;
#ifdef LITE_WITH_XPU
place = TARGET(kXPU);
#else
return;
#endif
DDim x_dims{{2, 3, 4, 5}};
std::vector<std::vector<int>> shapes{{5, 4, 3, 2},
{2, 3, 20},
{2, 60},
{120},
{2, 3, -1},
{0, 0, 20},
{0, 0, -1}};
for (auto shape : shapes) {
std::unique_ptr<arena::TestCase> tester(
new ReshapeComputeTester(place, "def", x_dims, shape));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision({"xshape"});
}
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/arena/framework.h"
namespace paddle {
namespace lite {
int data_index(std::vector<int> pos, DDimLite dims) {
int d1 = dims[1];
int d2 = dims[2];
int d3 = dims[3];
return pos[3] + pos[2] * d3 + pos[1] * d3 * d2 + pos[0] * d3 * d2 * d1;
}
std::vector<int> pos_trans(std::vector<int> in_pos, std::vector<int> axis) {
std::vector<int> out_pos(in_pos.size());
for (int i = 0; i < axis.size(); i++) {
out_pos[axis[i]] = in_pos[i];
}
return out_pos;
}
class TransposeComputeTester : public arena::TestCase {
protected:
// common attributes for this op.
std::string op_type_ = "transpose2";
std::string input_ = "x";
std::string output_ = "out";
std::string xshape_ = "xshape";
DDim x_dims_;
std::vector<int> axis_;
public:
TransposeComputeTester(const Place& place,
const std::string& alias,
DDim x_dims,
std::vector<int> axis)
: TestCase(place, alias), x_dims_(x_dims), axis_(axis) {}
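// Reference implementation: output dimension i equals input dimension
// axis[i], and the element at input position p is written to the output
// position q with q[axis[i]] = p[i], i.e. a plain permutation of a 4-D
// NCHW tensor.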
void RunBaseline(Scope* scope) override {
auto* out = scope->NewTensor(output_);
CHECK(out);
auto* x = scope->FindTensor(input_);
auto x_dims = x->dims();
std::vector<int64_t> out_shape(x_dims.size(), 0);
for (size_t i = 0; i < x_dims.size(); i++) {
out_shape[i] = x_dims[axis_[i]];
}
out->Resize(out_shape);
auto y_dims = out->dims();
int input_n = x_dims[0];
int input_c = x_dims[1];
int input_h = x_dims[2];
int input_w = x_dims[3];
auto input_data = x->data<float>();
auto output_data = out->mutable_data<float>();
for (int n = 0; n < input_n; ++n) {
for (int c = 0; c < input_c; ++c) {
for (int h = 0; h < input_h; ++h) {
for (int w = 0; w < input_w; ++w) {
std::vector<int> in_pos{n, c, h, w};
std::vector<int> out_pos = pos_trans(in_pos, axis_);
int in_index = data_index(in_pos, x_dims);
int out_index = data_index(out_pos, y_dims);
output_data[out_index] = input_data[in_index];
}
}
}
}
if (op_type_ == "transpose2") {
auto* xshape = scope->NewTensor(xshape_);
auto xshape_dims = x_dims.Vectorize();
xshape_dims.insert(xshape_dims.begin(), 0);
xshape->Resize(xshape_dims);
}
}
void PrepareOpDesc(cpp::OpDesc* op_desc) {
op_desc->SetType(op_type_);
op_desc->SetInput("X", {input_});
op_desc->SetOutput("Out", {output_});
if (op_type_ == "transpose2") {
op_desc->SetOutput("XShape", {xshape_});
}
op_desc->SetAttr("axis", axis_);
}
void PrepareData() override {
std::vector<float> data(x_dims_.production());
for (int i = 0; i < x_dims_.production(); i++) {
data[i] = i * 1.1;
}
SetCommonTensor(input_, x_dims_, data.data());
}
};
TEST(Transpose, precision) {
LOG(INFO) << "test Transpose op";
float abs_error = 2e-5;
Place place;
#ifdef LITE_WITH_XPU
place = TARGET(kXPU);
#else
return;
#endif
DDim x_dims{{2, 3, 4, 5}};
// [XPU]: {3, 1, 0, 2} is unsupported
std::vector<std::vector<int>> axes{
{0, 1, 2, 3}, {0, 1, 3, 2}, {0, 2, 1, 3}, {3, 1, 2, 0}};
for (auto axis : axes) {
std::unique_ptr<arena::TestCase> tester(
new TransposeComputeTester(place, "def", x_dims, axis));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision({"xshape"});
}
}
} // namespace lite
} // namespace paddle
...@@ -355,7 +355,8 @@ void test_pool_fp32(const std::vector<DDim>& input_dims, ...@@ -355,7 +355,8 @@ void test_pool_fp32(const std::vector<DDim>& input_dims,
LOG(FATAL) << "test fp32 pool: input: " << dim_in LOG(FATAL) << "test fp32 pool: input: " << dim_in
<< ", output: " << dim_out << ", output: " << dim_out
<< ", kernel dim: " << ksize[0] << ", " << ksize[1] << ", kernel dim: " << ksize[0] << ", " << ksize[1]
<< ", pad: " << pads[0] << ", " << pads[1] << ", pad: " << pads[0] << ", " << pads[1] << ", "
<< pads[2] << ", " << pads[3]
<< ", stride: " << strides[0] << ", " << strides[1] << ", stride: " << strides[0] << ", " << strides[1]
<< ", global_pooling: " << ", global_pooling: "
<< (flag_global ? "global" : "false") << (flag_global ? "global" : "false")
...@@ -370,6 +371,7 @@ void test_pool_fp32(const std::vector<DDim>& input_dims, ...@@ -370,6 +371,7 @@ void test_pool_fp32(const std::vector<DDim>& input_dims,
LOG(INFO) << "test fp32 pool: input: " << dim_in LOG(INFO) << "test fp32 pool: input: " << dim_in
<< ", output: " << dim_out << ", kernel dim: " << ksize[0] << ", output: " << dim_out << ", kernel dim: " << ksize[0]
<< ", " << ksize[1] << ", pad: " << pads[0] << ", " << pads[1] << ", " << ksize[1] << ", pad: " << pads[0] << ", " << pads[1]
<< ", " << pads[2] << ", " << pads[3]
<< ", stride: " << strides[0] << ", " << strides[1] << ", stride: " << strides[0] << ", " << strides[1]
<< ", global_pooling: " << (flag_global ? "global" : "false") << ", global_pooling: " << (flag_global ? "global" : "false")
<< ", pooling_type: " << pooling_type << ", pooling_type: " << pooling_type
......
lite_cc_library(debug_utils SRCS debug_utils.cc DEPS op_params model_parser) if(NOT LITE_ON_MODEL_OPTIMIZE_TOOL)
return()
endif()
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR LITE_ON_MODEL_OPTIMIZE_TOOL) lite_cc_library(debug_utils SRCS debug_utils.cc DEPS op_params model_parser)
lite_cc_binary(lite_model_debug_tool SRCS model_debug_tool.cc lite_cc_binary(lite_model_debug_tool SRCS model_debug_tool.cc
DEPS DEPS
cxx_api cxx_api
debug_utils debug_utils
...@@ -16,4 +18,3 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR LITE_ON_MODEL_OPTIMIZE_TOOL) ...@@ -16,4 +18,3 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR LITE_ON_MODEL_OPTIMIZE_TOOL)
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
CL_DEPS ${opencl_kernels}) CL_DEPS ${opencl_kernels})
endif()
...@@ -27,7 +27,7 @@ ...@@ -27,7 +27,7 @@
#include "lite/model_parser/pb/var_desc.h" #include "lite/model_parser/pb/var_desc.h"
#include "lite/utils/all.h" #include "lite/utils/all.h"
DEFINE_string(model_path, "", "Model dir path"); DEFINE_string(model_dir, "", "Model dir path");
DEFINE_string(input_file, "", "Input data file path"); DEFINE_string(input_file, "", "Input data file path");
DEFINE_string(topo_output_file, "", "Runtime topology order output file path"); DEFINE_string(topo_output_file, "", "Runtime topology order output file path");
DEFINE_bool(output_topo, true, "Dump runtime topology or not"); DEFINE_bool(output_topo, true, "Dump runtime topology or not");
...@@ -185,7 +185,7 @@ void ParseConfig(DebugConfig* conf) { ...@@ -185,7 +185,7 @@ void ParseConfig(DebugConfig* conf) {
CHECK(conf); CHECK(conf);
#define CHECK_NON_EMPTY(name__) \ #define CHECK_NON_EMPTY(name__) \
CHECK(!FLAGS_##name__.empty()) << "Option " << #name__ << " can't be empty." CHECK(!FLAGS_##name__.empty()) << "Option " << #name__ << " can't be empty."
CHECK_NON_EMPTY(model_path); CHECK_NON_EMPTY(model_dir);
if (FLAGS_output_topo) { if (FLAGS_output_topo) {
CHECK_NON_EMPTY(topo_output_file); CHECK_NON_EMPTY(topo_output_file);
} }
...@@ -193,7 +193,7 @@ void ParseConfig(DebugConfig* conf) { ...@@ -193,7 +193,7 @@ void ParseConfig(DebugConfig* conf) {
CHECK_NON_EMPTY(tensor_output_file); CHECK_NON_EMPTY(tensor_output_file);
} }
#undef CHECK_NON_EMPTY #undef CHECK_NON_EMPTY
conf->model_dir = FLAGS_model_path; conf->model_dir = FLAGS_model_dir;
conf->topo_output_file = FLAGS_topo_output_file; conf->topo_output_file = FLAGS_topo_output_file;
conf->tensor_output_file = FLAGS_tensor_output_file; conf->tensor_output_file = FLAGS_tensor_output_file;
conf->input_file = FLAGS_input_file; conf->input_file = FLAGS_input_file;
......
...@@ -33,28 +33,49 @@ namespace paddle_mobile { ...@@ -33,28 +33,49 @@ namespace paddle_mobile {
static const char *ANDROID_LOG_TAG = static const char *ANDROID_LOG_TAG =
"paddle_mobile LOG built on " __DATE__ " " __TIME__; "paddle_mobile LOG built on " __DATE__ " " __TIME__;
#ifdef PADDLE_ENABLE_COLORABLE_LOG
#define PADDLE_RED "\033[1;31;40m"
#define PADDLE_GREEN "\033[1;32;40m"
#define PADDLE_YELLOW "\033[1;33;40m"
#define PADDLE_LIGHT_RED "\033[1;35;40m"
#define PADDLE_BLUE "\033[1;34;40m"
#define PADDLE_WHITE "\033[1;37;40m"
#define PADDLE_CONON "\033[0m"
#else
#define PADDLE_RED ""
#define PADDLE_GREEN ""
#define PADDLE_YELLOW ""
#define PADDLE_LIGHT_RED ""
#define PADDLE_BLUE ""
#define PADDLE_WHITE ""
#define PADDLE_CONON ""
#endif
#define ANDROIDLOGI(...) \ #define ANDROIDLOGI(...) \
__android_log_print(ANDROID_LOG_INFO, ANDROID_LOG_TAG, __VA_ARGS__); \ __android_log_print(ANDROID_LOG_INFO, ANDROID_LOG_TAG, __VA_ARGS__); \
fprintf(stderr, "%s\n", __VA_ARGS__); \ fprintf(stderr, PADDLE_YELLOW "%s\n" PADDLE_CONON, __VA_ARGS__); \
fflush(stderr) fflush(stderr)
#define ANDROIDLOGW(...) \ #define ANDROIDLOGW(...) \
__android_log_print(ANDROID_LOG_WARNING, ANDROID_LOG_TAG, __VA_ARGS__); \ __android_log_print(ANDROID_LOG_WARN, ANDROID_LOG_TAG, __VA_ARGS__); \
fprintf(stderr, "%s\n", __VA_ARGS__); \ fprintf(stderr, PADDLE_LIGHT_RED "%s\n" PADDLE_CONON, __VA_ARGS__); \
fflush(stderr) fflush(stderr)
#define ANDROIDLOGD(...) \ #define ANDROIDLOGD(...) \
__android_log_print(ANDROID_LOG_DEBUG, ANDROID_LOG_TAG, __VA_ARGS__); \ __android_log_print(ANDROID_LOG_DEBUG, ANDROID_LOG_TAG, __VA_ARGS__); \
fprintf(stderr, "%s\n", __VA_ARGS__); \ fprintf(stderr, PADDLE_WHITE "%s\n" PADDLE_CONON, __VA_ARGS__); \
fflush(stderr) fflush(stderr)
#define ANDROIDLOGE(...) \ #define ANDROIDLOGE(...) \
__android_log_print(ANDROID_LOG_ERROR, ANDROID_LOG_TAG, __VA_ARGS__); \ __android_log_print(ANDROID_LOG_ERROR, ANDROID_LOG_TAG, __VA_ARGS__); \
fprintf(stderr, "%s\n", __VA_ARGS__); \ fprintf(stderr, PADDLE_RED "%s\n" PADDLE_CONON, __VA_ARGS__); \
fflush(stderr)
#define ANDROIDLOGV(...) \
__android_log_print(ANDROID_LOG_VERBOSE, ANDROID_LOG_TAG, __VA_ARGS__); \
fprintf(stderr, PADDLE_GREEN "%s\n" PADDLE_CONON, __VA_ARGS__); \
fflush(stderr) fflush(stderr)
#else #else
#define ANDROIDLOGI(...) #define ANDROIDLOGI(...)
#define ANDROIDLOGW(...) #define ANDROIDLOGW(...)
#define ANDROIDLOGD(...) #define ANDROIDLOGD(...)
#define ANDROIDLOGE(...) #define ANDROIDLOGE(...)
#define ANDROIDLOGV(...)
#endif #endif
...@@ -63,6 +84,7 @@ enum LogLevel { ...@@ -63,6 +84,7 @@ enum LogLevel {
kLOG_ERROR, kLOG_ERROR,
kLOG_WARNING, kLOG_WARNING,
kLOG_INFO, kLOG_INFO,
kLOG_VERBOSE,
kLOG_DEBUG, kLOG_DEBUG,
kLOG_DEBUG1, kLOG_DEBUG1,
kLOG_DEBUG2, kLOG_DEBUG2,
...@@ -73,9 +95,9 @@ enum LogLevel { ...@@ -73,9 +95,9 @@ enum LogLevel {
// log level // log level
static LogLevel log_level = kLOG_DEBUG4; static LogLevel log_level = kLOG_DEBUG4;
static std::vector<std::string> logs{"NO", "ERROR ", "WARNING", static std::vector<std::string> logs{"NO ", "ERROR ", "WARNING", "INFO ",
"INFO ", "DEBUG ", "DEBUG1 ", "VERBOSE", "DEBUG ", "DEBUG1 ", "DEBUG2 ",
"DEBUG2 ", "DEBUG3 ", "DEBUG4 "}; "DEBUG3 ", "DEBUG4 "};
struct ToLog; struct ToLog;
struct Print; struct Print;
...@@ -97,9 +119,27 @@ struct Print { ...@@ -97,9 +119,27 @@ struct Print {
#else #else
std::cerr << buffer_.str() << std::endl; std::cerr << buffer_.str() << std::endl;
#endif #endif
} else { } else if (level == kLOG_INFO) {
#ifdef ANDROID #ifdef ANDROID
ANDROIDLOGI(buffer_.str().c_str()); ANDROIDLOGI(buffer_.str().c_str());
#else
std::cerr << buffer_.str() << std::endl;
#endif
} else if (level == kLOG_VERBOSE) {
#ifdef ANDROID
ANDROIDLOGV(buffer_.str().c_str());
#else
std::cerr << buffer_.str() << std::endl;
#endif
} else if (level == kLOG_WARNING) {
#ifdef ANDROID
ANDROIDLOGW(buffer_.str().c_str());
#else
std::cerr << buffer_.str() << std::endl;
#endif
} else {
#ifdef ANDROID
ANDROIDLOGD(buffer_.str().c_str());
#else #else
std::cout << buffer_.str() << std::endl; std::cout << buffer_.str() << std::endl;
#endif #endif
...@@ -131,6 +171,7 @@ struct ToLog { ...@@ -131,6 +171,7 @@ struct ToLog {
#define LOG(level) \ #define LOG(level) \
if (level > paddle_mobile::log_level) { \ if (level > paddle_mobile::log_level) { \
/* NOLINTNEXTLINE */ \
} else \ } else \
paddle_mobile::ToLog( \ paddle_mobile::ToLog( \
level, static_cast<const std::stringstream &>( \ level, static_cast<const std::stringstream &>( \
...@@ -143,6 +184,7 @@ struct ToLog { ...@@ -143,6 +184,7 @@ struct ToLog {
#define DLOG \ #define DLOG \
if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) { \ if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) { \
/* NOLINTNEXTLINE */ \
} else \ } else \
paddle_mobile::ToLog( \ paddle_mobile::ToLog( \
paddle_mobile::kLOG_DEBUG, \ paddle_mobile::kLOG_DEBUG, \
...@@ -156,11 +198,13 @@ struct ToLog { ...@@ -156,11 +198,13 @@ struct ToLog {
#define LOGF(level, format, ...) \ #define LOGF(level, format, ...) \
if (level > paddle_mobile::log_level) { \ if (level > paddle_mobile::log_level) { \
/* NOLINTNEXTLINE */ \
} else \ } else \
printf(format, ##__VA_ARGS__) printf(format, ##__VA_ARGS__)
#define DLOGF(format, ...) \ #define DLOGF(format, ...) \
if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) { \ if (paddle_mobile::kLOG_DEBUG > paddle_mobile::log_level) { \
/* NOLINTNEXTLINE */ \
} else \ } else \
printf(format, ##__VA_ARGS__) printf(format, ##__VA_ARGS__)
...@@ -170,12 +214,14 @@ struct ToLog { ...@@ -170,12 +214,14 @@ struct ToLog {
#define ANDROIDLOGW(...) #define ANDROIDLOGW(...)
#define ANDROIDLOGD(...) #define ANDROIDLOGD(...)
#define ANDROIDLOGE(...) #define ANDROIDLOGE(...)
#define ANDROIDLOGV(...)
enum LogLevel { enum LogLevel {
kNO_LOG, kNO_LOG,
kLOG_ERROR, kLOG_ERROR,
kLOG_WARNING, kLOG_WARNING,
kLOG_INFO, kLOG_INFO,
kLOG_VERBOSE,
kLOG_DEBUG, kLOG_DEBUG,
kLOG_DEBUG1, kLOG_DEBUG1,
kLOG_DEBUG2, kLOG_DEBUG2,
...@@ -193,7 +239,7 @@ struct Print { ...@@ -193,7 +239,7 @@ struct Print {
}; };
struct ToLog { struct ToLog {
ToLog(LogLevel level) {} explicit ToLog(LogLevel level) {}
template <typename T> template <typename T>
ToLog &operator<<(T const &value) { ToLog &operator<<(T const &value) {
...@@ -201,14 +247,16 @@ struct ToLog { ...@@ -201,14 +247,16 @@ struct ToLog {
} }
}; };
#define LOG(level) \ #define LOG(level) \
if (true) { \ if (true) { \
} else \ /* NOLINTNEXTLINE */ \
} else \
paddle_mobile::ToLog(level) paddle_mobile::ToLog(level)
#define DLOG \ #define DLOG \
if (true) { \ if (true) { \
} else \ /* NOLINTNEXTLINE */ \
} else \
paddle_mobile::ToLog(paddle_mobile::kLOG_DEBUG) paddle_mobile::ToLog(paddle_mobile::kLOG_DEBUG)
#define LOGF(level, format, ...) #define LOGF(level, format, ...)
......
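For context, a minimal sketch of how the leveled logging macros above are typically invoked (the messages are illustrative, not taken from the sources):
LOG(paddle_mobile::kLOG_VERBOSE) << "verbose message, gated by log_level";
DLOG << "debug-only message";
LOGF(paddle_mobile::kLOG_INFO, "iteration %d finished", 10);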
...@@ -134,6 +134,8 @@ const char *G_OP_TYPE_FILL_CONSTAN_BATCH_SIZE_LIKE = ...@@ -134,6 +134,8 @@ const char *G_OP_TYPE_FILL_CONSTAN_BATCH_SIZE_LIKE =
"fill_constant_batch_size_like"; "fill_constant_batch_size_like";
const char *G_OP_TYPE_FUSION_INSTANCENORM_RELU = "fusion_instancenorm_relu"; const char *G_OP_TYPE_FUSION_INSTANCENORM_RELU = "fusion_instancenorm_relu";
const char *G_OP_TYPE_PIXEL_SHUFFLE = "pixel_shuffle"; const char *G_OP_TYPE_PIXEL_SHUFFLE = "pixel_shuffle";
const char *G_OP_TYPE_EXPAND = "expand";
const char *G_OP_TYPE_GRID_SAMPLER = "grid_sampler";
std::unordered_map< std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>> std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
...@@ -156,7 +158,7 @@ std::unordered_map< ...@@ -156,7 +158,7 @@ std::unordered_map<
{G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}}, {G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}},
{G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}}, {G_OP_TYPE_POOL2D, {{"X"}, {"Out"}}},
{G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}}, {G_OP_TYPE_BATCHNORM, {{"X"}, {"Y"}}},
{G_OP_TYPE_INSTANCENORM, {{"X"}, {"Out"}}}, {G_OP_TYPE_INSTANCENORM, {{"X"}, {"Y"}}},
{G_OP_TYPE_FUSION_INSTANCENORM_RELU, {{"X"}, {"Out"}}}, {G_OP_TYPE_FUSION_INSTANCENORM_RELU, {{"X"}, {"Out"}}},
{G_OP_TYPE_LRN, {{"X"}, {"Out"}}}, {G_OP_TYPE_LRN, {{"X"}, {"Out"}}},
{G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}}, {G_OP_TYPE_CONCAT, {{"X"}, {"Out"}}},
...@@ -258,5 +260,7 @@ std::unordered_map< ...@@ -258,5 +260,7 @@ std::unordered_map<
{{"Ids", "Scores"}, {"SentenceIds", "SentenceScores"}}}, {{"Ids", "Scores"}, {"SentenceIds", "SentenceScores"}}},
{G_OP_TYPE_FILL_CONSTAN_BATCH_SIZE_LIKE, {{"Input"}, {"Out"}}}, {G_OP_TYPE_FILL_CONSTAN_BATCH_SIZE_LIKE, {{"Input"}, {"Out"}}},
{G_OP_TYPE_PAD2D, {{"X"}, {"Out"}}}, {G_OP_TYPE_PAD2D, {{"X"}, {"Out"}}},
{G_OP_TYPE_PIXEL_SHUFFLE, {{"X"}, {"Out"}}}}; {G_OP_TYPE_PIXEL_SHUFFLE, {{"X"}, {"Out"}}},
{G_OP_TYPE_EXPAND, {{"X"}, {"Out"}}},
{G_OP_TYPE_GRID_SAMPLER, {{"X", "Grid"}, {"Output"}}}};
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -265,6 +265,8 @@ extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN; ...@@ -265,6 +265,8 @@ extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN;
extern const char *G_OP_TYPE_FUSION_DECONV_BN_RELU; extern const char *G_OP_TYPE_FUSION_DECONV_BN_RELU;
extern const char *G_OP_TYPE_FUSION_INSTANCENORM_RELU; extern const char *G_OP_TYPE_FUSION_INSTANCENORM_RELU;
extern const char *G_OP_TYPE_PIXEL_SHUFFLE; extern const char *G_OP_TYPE_PIXEL_SHUFFLE;
extern const char *G_OP_TYPE_EXPAND;
extern const char *G_OP_TYPE_GRID_SAMPLER;
extern std::unordered_map< extern std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>> std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
......
...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "fpga/V2/bias_scale.h" #include "fpga/V2/bias_scale.h"
#include <memory.h>
#include <math.h> #include <math.h>
#include <memory.h>
#include "fpga/common/fpga_common.h" #include "fpga/common/fpga_common.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -56,15 +56,16 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) { ...@@ -56,15 +56,16 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
*data_in = ptr_aligned; *data_in = ptr_aligned;
} }
void fixed_scale_bias_new(void*data_in, int data_len) { void fixed_scale_bias_new(void *data_in, int data_len) {
int* data_tmp = static_cast<int*>(data_in); int *data_tmp = static_cast<int *>(data_in);
for (int idx = 0; idx < data_len/2; ++idx) { for (int idx = 0; idx < data_len / 2; ++idx) {
float tmp = (static_cast<float*>(data_in))[idx]; float tmp = (static_cast<float *>(data_in))[idx];
data_tmp[idx] = static_cast<int>(round(tmp*pow(2.0, 23.0))); data_tmp[idx] = static_cast<int>(round(tmp * pow(2.0, 23.0)));
tmp = (static_cast<float*>(data_in))[idx+data_len/2]; tmp = (static_cast<float *>(data_in))[idx + data_len / 2];
data_tmp[idx+data_len/2] = static_cast<int>(round(tmp*pow(2.0, 30.0))); data_tmp[idx + data_len / 2] =
} static_cast<int>(round(tmp * pow(2.0, 30.0)));
return; }
return;
} }
void interleave(float **data_in, int num_after_alignment) { void interleave(float **data_in, int num_after_alignment) {
......
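As a note on the fixed_scale_bias_new change above: the float buffer is converted in place to the fixed-point layout expected by the FPGA, i.e. the first data_len/2 entries (scales) become round(v * 2^23) and the remaining entries (biases) become round(v * 2^30), both stored as 32-bit integers.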
...@@ -94,11 +94,10 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out, ...@@ -94,11 +94,10 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out,
for (i = 0; i < image_num; i++) { for (i = 0; i < image_num; i++) {
align_each_in_area_cw = align_each_in_area_cw =
align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT); align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT);
memcpy( memcpy((int8_t *)image_out + tmp_channel + // NOLINT
(int8_t *)image_out + tmp_channel + // NOLINT k * align_each_out_area_cw_differ,
k * align_each_out_area_cw_differ, images_in[i] + j * channel_num[i] + k * align_each_in_area_cw,
images_in[i] + j * channel_num[i] + k * align_each_in_area_cw, channel_num[i] * sizeof(int8_t));
channel_num[i] * sizeof(int8_t));
tmp_channel += channel_num[i]; tmp_channel += channel_num[i];
} }
......
...@@ -257,8 +257,8 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -257,8 +257,8 @@ int ComputeBasicConv(const struct ConvArgs &args) {
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret; return ret;
} }
// reg_writeq(reg_ActivationArgs, // reg_writeq(reg_ActivationArgs,
// REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // activation function // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // activation function
reg_writeq(output_scale, REG_SCALE_PARAMETER); reg_writeq(output_scale, REG_SCALE_PARAMETER);
// new // new
...@@ -274,10 +274,10 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -274,10 +274,10 @@ int ComputeBasicConv(const struct ConvArgs &args) {
args.driver.filter_pad_width_mul_channel, args.driver.filter_pad_width_mul_channel,
REG_CONV_REG1); REG_CONV_REG1);
reg_writeq((args.driver.stride_h << 50) | (args.driver.skip_window << 30) | reg_writeq((args.driver.stride_h << 50) | (args.driver.skip_window << 30) |
(args.driver.filter_row << 10) | (args.driver.filter_row << 10) |
(args.driver.filter_height << 5) | args.driver.filter_width, (args.driver.filter_height << 5) | args.driver.filter_width,
REG_CONV_REG2); REG_CONV_REG2);
reg_writeq((args.driver.filter_num << 42) | (args.driver.filter_align << 26) | reg_writeq((args.driver.filter_num << 42) | (args.driver.filter_align << 26) |
(args.driver.prog_full_cnt << 16) | (args.driver.prog_full_cnt << 16) |
...@@ -369,74 +369,77 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { ...@@ -369,74 +369,77 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
uint64_t cmd = 0; uint64_t cmd = 0;
uint64_t image_physical_address = 0; uint64_t image_physical_address = 0;
uint64_t output_physical_address = 0; uint64_t output_physical_address = 0;
uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
image_physical_address = vaddr_to_paddr(args.image.address); image_physical_address = vaddr_to_paddr(args.image.address);
output_physical_address = vaddr_to_paddr(args.output.address); output_physical_address = vaddr_to_paddr(args.output.address);
uint64_t C_paral_64 = align_to_x((uint64_t)args.image.channels, 64); uint64_t C_paral_64 = align_to_x((uint64_t)args.image.channels, 64);
uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32); uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32);
uint64_t output_height = (uint64_t)( uint64_t output_height = (uint64_t)(
(args.image.height + args.image.pad_height * 2 - args.kernel.height) / (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.stride_h + 1); args.kernel.stride_h +
1);
uint64_t output_width = (uint64_t)( uint64_t output_width = (uint64_t)(
(args.image.width + args.image.pad_width * 2 - args.kernel.width) / (args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w + 1); args.kernel.stride_w +
1);
uint64_t image_amount_per_row = uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
IMAGE_ALIGNMENT); IMAGE_ALIGNMENT);
uint64_t image_one_pad_per_row = (uint64_t)args.image.width * uint64_t image_one_pad_per_row =
(uint64_t)args.image.channels +(uint64_t)args.image.pad_width * (uint64_t)args.image.width * (uint64_t)args.image.channels +
(uint64_t)args.image.channels; (uint64_t)args.image.pad_width * (uint64_t)args.image.channels;
uint64_t result_amount_align_32 = align_to_x((uint64_t)output_width * uint64_t result_amount_align_32 =
(uint64_t)args.image.channels, 32); align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, 32);
uint64_t result_addr_row = uint64_t result_addr_row =
(result_amount_align_32 << 32) | output_physical_address; (result_amount_align_32 << 32) | output_physical_address;
uint64_t row_padding_down = uint64_t row_padding_down =
(uint64_t)args.image.height + (uint64_t)args.image.pad_height; (uint64_t)args.image.height + (uint64_t)args.image.pad_height;
uint64_t kernel_width_sub1 = uint64_t kernel_width_sub1 = (uint64_t)args.kernel.width - 1;
(uint64_t)args.kernel.width - 1;
uint64_t kernel_padding_step = row_padding_down | uint64_t kernel_padding_step = row_padding_down |
((uint64_t)args.image.pad_height << 16) | ((uint64_t)args.image.pad_height << 16) |
((uint64_t)args.kernel.stride_h << 24) | ((uint64_t)args.kernel.stride_h << 24) |
((uint64_t)kernel_width_sub1<<32) | ((uint64_t)kernel_width_sub1 << 32) |
((uint64_t)args.kernel.height << 40) | ((uint64_t)args.kernel.height << 40) |
((uint64_t)(args.kernel.height-1) << 48); ((uint64_t)(args.kernel.height - 1) << 48);
uint64_t image_calcu_height = (uint64_t)args.kernel.height + uint64_t image_calcu_height =
(output_height - 1) * (uint64_t)args.kernel.stride_h; (uint64_t)args.kernel.height +
(output_height - 1) * (uint64_t)args.kernel.stride_h;
uint64_t result_size_calcu_height = (output_height - 1) | uint64_t result_size_calcu_height = (output_height - 1) |
((output_width - 1) << 16) | (image_calcu_height << 32); ((output_width - 1) << 16) |
uint64_t col_padding_down = ((uint64_t)args.image.width + (image_calcu_height << 32);
(uint64_t)args.image.pad_width) * (uint64_t)args.image.channels; uint64_t col_padding_down =
((uint64_t)args.image.width + (uint64_t)args.image.pad_width) *
(uint64_t)args.image.channels;
uint64_t image_row_col_padding_down = uint64_t image_row_col_padding_down =
image_amount_per_row | (col_padding_down << 32); image_amount_per_row | (col_padding_down << 32);
uint64_t image_rowXpadding_h = uint64_t image_rowXpadding_h =
image_amount_per_row * (uint64_t)args.image.pad_height; image_amount_per_row * (uint64_t)args.image.pad_height;
uint64_t image_rowXstep_h = uint64_t image_rowXstep_h =
image_amount_per_row * (uint64_t)args.kernel.stride_h; image_amount_per_row * (uint64_t)args.kernel.stride_h;
uint64_t image_rowXpad_h_rowXstep_h = uint64_t image_rowXpad_h_rowXstep_h =
image_rowXpadding_h | (image_rowXstep_h << 32); image_rowXpadding_h | (image_rowXstep_h << 32);
uint64_t channelXpad_w = uint64_t channelXpad_w =
(uint64_t)args.image.channels * (uint64_t)args.image.pad_width; (uint64_t)args.image.channels * (uint64_t)args.image.pad_width;
uint64_t channelXstep_w = uint64_t channelXstep_w =
(uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w; (uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w;
uint64_t channelXpad_w_channelXstep_w = uint64_t channelXpad_w_channelXstep_w =
channelXpad_w | (channelXstep_w << 32); channelXpad_w | (channelXstep_w << 32);
uint64_t filter_row_align = uint64_t filter_row_align = C_align_32 * (uint64_t)args.kernel.width;
C_align_32 * (uint64_t)args.kernel.width; uint64_t sub_filter_amount_align =
uint64_t sub_filter_amount_align = C_align_32 * C_align_32 * (uint64_t)args.kernel.width * (uint64_t)args.kernel.height;
(uint64_t)args.kernel.width * (uint64_t)args.kernel.height;
uint64_t mult_factor = 0; uint64_t mult_factor = 0;
float average_reciprocal = args.kernel_reciprocal; float average_reciprocal = args.kernel_reciprocal;
uint32_t* kernel_reciprocal; uint32_t *kernel_reciprocal;
kernel_reciprocal =(reinterpret_cast<uint32_t*>(&average_reciprocal)); kernel_reciprocal = (reinterpret_cast<uint32_t *>(&average_reciprocal));
if (args.mode == 1) if (args.mode == 1)
mult_factor = (uint64_t)(*kernel_reciprocal) | mult_factor = (uint64_t)(*kernel_reciprocal) | ((uint64_t)1 << 32) |
((uint64_t)1 << 32) | ((uint64_t)1 << 40); ((uint64_t)1 << 40);
else else
mult_factor = mult_factor =
(uint64_t)0x3f800000 | ((uint64_t)1 << 32) | ((uint64_t)1 << 40); (uint64_t)0x3f800000 | ((uint64_t)1 << 32) | ((uint64_t)1 << 40);
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
ret = -EIO; ret = -EIO;
...@@ -501,7 +504,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { ...@@ -501,7 +504,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#endif #endif
#ifdef PADDLE_MOBILE_ZU5 #ifdef PADDLE_MOBILE_ZU5
int ret = 0; int ret = 0;
uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) { if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) {
...@@ -511,7 +514,6 @@ uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); ...@@ -511,7 +514,6 @@ uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
return ret; return ret;
} }
uint64_t image0_physical_address = 0; uint64_t image0_physical_address = 0;
uint64_t image1_physical_address = 0; uint64_t image1_physical_address = 0;
uint64_t image_physical_address = 0; uint64_t image_physical_address = 0;
...@@ -519,26 +521,28 @@ uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); ...@@ -519,26 +521,28 @@ uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
image0_physical_address = vaddr_to_paddr(args.image0.address); image0_physical_address = vaddr_to_paddr(args.image0.address);
image1_physical_address = vaddr_to_paddr(args.image1.address); image1_physical_address = vaddr_to_paddr(args.image1.address);
image_physical_address = image_physical_address =
image0_physical_address | (image1_physical_address << 32); image0_physical_address | (image1_physical_address << 32);
output_physical_address = vaddr_to_paddr(args.output.address); output_physical_address = vaddr_to_paddr(args.output.address);
uint64_t image_amount_per_row = uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image0.width * align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels,
(uint64_t)args.image0.channels, IMAGE_ALIGNMENT); IMAGE_ALIGNMENT);
uint64_t result_addr_row = uint64_t result_addr_row =
output_physical_address | (image_amount_per_row << 32); output_physical_address | (image_amount_per_row << 32);
uint64_t kernel_padding_step = 0; uint64_t kernel_padding_step = 0;
kernel_padding_step = ((uint64_t)args.image0.height * 2) | kernel_padding_step = ((uint64_t)args.image0.height * 2) |
((uint64_t)2 << 24) | ((uint64_t)2 << 40) | ((uint64_t)1 << 48); ((uint64_t)2 << 24) | ((uint64_t)2 << 40) |
uint64_t result_size_calcu_height = ((uint64_t)args.image0.height - 1) | ((uint64_t)1 << 48);
((image_amount_per_row / 32 - 1) << 16) | uint64_t result_size_calcu_height =
(((uint64_t)args.image0.height * 2) << 32); ((uint64_t)args.image0.height - 1) |
uint64_t image_row_col_padding_down = image_amount_per_row | ((image_amount_per_row / 32 - 1) << 16) |
(image_amount_per_row << 32); (((uint64_t)args.image0.height * 2) << 32);
float quantParam = uint64_t image_row_col_padding_down =
((args.image0.scale_address)[0]) / ((args.output.scale_address)[0]); image_amount_per_row | (image_amount_per_row << 32);
uint32_t* ew_scale = reinterpret_cast<uint32_t*>(&quantParam); float quantParam =
uint64_t ew_scale_mult_factor = (*ew_scale) | ((args.image0.scale_address)[0]) / ((args.output.scale_address)[0]);
((uint64_t)args.const0 << 32) | ((uint64_t)args.const1 << 40); uint32_t *ew_scale = reinterpret_cast<uint32_t *>(&quantParam);
uint64_t ew_scale_mult_factor = (*ew_scale) | ((uint64_t)args.const0 << 32) |
((uint64_t)args.const1 << 40);
reg_writeq(0ul, REG_SCALE_PARAMETER); reg_writeq(0ul, REG_SCALE_PARAMETER);
reg_writeq(image_physical_address, 0x808); reg_writeq(image_physical_address, 0x808);
reg_writeq(result_addr_row, 0x810); reg_writeq(result_addr_row, 0x810);
...@@ -546,7 +550,7 @@ uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); ...@@ -546,7 +550,7 @@ uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
reg_writeq(result_size_calcu_height, 0x820); reg_writeq(result_size_calcu_height, 0x820);
reg_writeq(32, 0x828); reg_writeq(32, 0x828);
reg_writeq(image_row_col_padding_down, 0x830); reg_writeq(image_row_col_padding_down, 0x830);
reg_writeq(((image_amount_per_row*2) << 32), 0x838); reg_writeq(((image_amount_per_row * 2) << 32), 0x838);
reg_writeq(ew_scale_mult_factor, 0x840); // dw: do not care reg_writeq(ew_scale_mult_factor, 0x840); // dw: do not care
reg_writeq(((uint64_t)32 << 32), 0x848); reg_writeq(((uint64_t)32 << 32), 0x848);
reg_writeq(0, 0x858); reg_writeq(0, 0x858);
...@@ -924,7 +928,7 @@ int ComputeDWConv(const struct DWconvArgs &args) { ...@@ -924,7 +928,7 @@ int ComputeDWConv(const struct DWconvArgs &args) {
<< " pad_height:" << args.image.pad_height << " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width; << " pad_width:" << args.image.pad_width;
DLOG << " filter_address:" << args.filter_address; DLOG << " filter_address:" << args.filter_address;
//<< " bias_address:" << args.bias_address; //<< " bias_address:" << args.bias_address;
DLOG << " kernel_height:" << args.kernel.height DLOG << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width << " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h << " stride_h:" << args.kernel.stride_h
...@@ -950,67 +954,71 @@ int ComputeDWConv(const struct DWconvArgs &args) { ...@@ -950,67 +954,71 @@ int ComputeDWConv(const struct DWconvArgs &args) {
bias_physical_address = vaddr_to_paddr(args.bias_address); bias_physical_address = vaddr_to_paddr(args.bias_address);
uint64_t C_align_64 = align_to_x((uint64_t)args.image.channels, 64); uint64_t C_align_64 = align_to_x((uint64_t)args.image.channels, 64);
uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32); uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32);
uint64_t output_height = (uint64_t) uint64_t output_height = (uint64_t)(
((args.image.height + args.image.pad_height * 2 - (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.height) / args.kernel.stride_h +1); args.kernel.stride_h +
uint64_t output_width = (uint64_t) 1);
(((args.image.width + args.image.pad_width * 2 - args.kernel.width) / uint64_t output_width = (uint64_t)(
args.kernel.stride_w + 1) * args.sub_conv_num); ((args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w +
1) *
args.sub_conv_num);
uint64_t image_amount_per_row = uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image.width * align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
(uint64_t)args.image.channels, IMAGE_ALIGNMENT); IMAGE_ALIGNMENT);
uint64_t image_one_pad_per_row = uint64_t image_one_pad_per_row =
(uint64_t)args.image.width * (uint64_t)args.image.channels + (uint64_t)args.image.width * (uint64_t)args.image.channels +
(uint64_t)args.image.pad_width * (uint64_t)args.image.channels; (uint64_t)args.image.pad_width * (uint64_t)args.image.channels;
uint64_t result_amount_align_32 = align_to_x( uint64_t result_amount_align_32 =
(uint64_t)output_width * (uint64_t)args.image.channels, 32); align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, 32);
uint64_t result_addr_row = uint64_t result_addr_row =
(result_amount_align_32 << 32) | output_physical_address; (result_amount_align_32 << 32) | output_physical_address;
uint64_t row_padding_down = uint64_t row_padding_down =
(uint64_t)args.image.height + (uint64_t)args.image.pad_height; (uint64_t)args.image.height + (uint64_t)args.image.pad_height;
uint64_t kernel_width_sub1 = (uint64_t)args.kernel.width - 1; uint64_t kernel_width_sub1 = (uint64_t)args.kernel.width - 1;
uint64_t kernel_padding_step = row_padding_down | uint64_t kernel_padding_step = row_padding_down |
((uint64_t)args.image.pad_height << 16) | ((uint64_t)args.image.pad_height << 16) |
((uint64_t)args.kernel.stride_h << 24) | ((uint64_t)args.kernel.stride_h << 24) |
((uint64_t)kernel_width_sub1<<32) | ((uint64_t)kernel_width_sub1 << 32) |
((uint64_t)args.kernel.height << 40) | ((uint64_t)args.kernel.height << 40) |
((uint64_t)(args.kernel.height-1) << 48); ((uint64_t)(args.kernel.height - 1) << 48);
uint64_t image_calcu_height = (uint64_t)args.kernel.height + uint64_t image_calcu_height =
(output_height - 1) * (uint64_t)args.kernel.stride_h; (uint64_t)args.kernel.height +
(output_height - 1) * (uint64_t)args.kernel.stride_h;
uint64_t result_size_calcu_height = (output_height - 1) | uint64_t result_size_calcu_height = (output_height - 1) |
((output_width - 1) << 16) | (image_calcu_height << 32); ((output_width - 1) << 16) |
uint64_t col_padding_down = ((uint64_t)args.image.width + (image_calcu_height << 32);
(uint64_t)args.image.pad_width) * (uint64_t)args.image.channels; uint64_t col_padding_down =
((uint64_t)args.image.width + (uint64_t)args.image.pad_width) *
(uint64_t)args.image.channels;
uint64_t image_row_col_padding_down = uint64_t image_row_col_padding_down =
image_amount_per_row | (col_padding_down << 32); image_amount_per_row | (col_padding_down << 32);
uint64_t image_rowXpadding_h = uint64_t image_rowXpadding_h =
image_amount_per_row * (uint64_t)args.image.pad_height; image_amount_per_row * (uint64_t)args.image.pad_height;
uint64_t image_rowXstep_h = uint64_t image_rowXstep_h =
image_amount_per_row * (uint64_t)args.kernel.stride_h; image_amount_per_row * (uint64_t)args.kernel.stride_h;
uint64_t image_rowXpad_h_rowXstep_h = uint64_t image_rowXpad_h_rowXstep_h =
image_rowXpadding_h | (image_rowXstep_h << 32); image_rowXpadding_h | (image_rowXstep_h << 32);
uint64_t channelXpad_w = uint64_t channelXpad_w =
(uint64_t)args.image.channels * (uint64_t)args.image.pad_width; (uint64_t)args.image.channels * (uint64_t)args.image.pad_width;
uint64_t channelXstep_w = uint64_t channelXstep_w =
(uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w; (uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w;
uint64_t channelXpad_w_channelXstep_w = uint64_t channelXpad_w_channelXstep_w =
channelXpad_w | (channelXstep_w << 32); channelXpad_w | (channelXstep_w << 32);
uint64_t filter_row_align = uint64_t filter_row_align = C_align_64 * (uint64_t)args.kernel.width;
C_align_64 * (uint64_t)args.kernel.width; uint64_t sub_filter_amount_align =
uint64_t sub_filter_amount_align = C_align_64 * C_align_64 * (uint64_t)args.kernel.width * (uint64_t)args.kernel.height;
(uint64_t)args.kernel.width *
(uint64_t)args.kernel.height;
uint64_t filter_amount_align = uint64_t filter_amount_align =
sub_filter_amount_align * (uint64_t)args.sub_conv_num; sub_filter_amount_align * (uint64_t)args.sub_conv_num;
uint64_t filter_param = filter_row_align | (filter_amount_align << 16) | uint64_t filter_param = filter_row_align | (filter_amount_align << 16) |
(sub_filter_amount_align << 32) | (sub_filter_amount_align << 32) |
(((uint64_t)args.sub_conv_num -1) << 48); (((uint64_t)args.sub_conv_num - 1) << 48);
uint64_t channel_parameter = uint64_t channel_parameter =
(uint64_t)args.image.channels | (C_align_64 << 16); (uint64_t)args.image.channels | (C_align_64 << 16);
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
ret = -EIO; ret = -EIO;
...@@ -1030,8 +1038,9 @@ int ComputeDWConv(const struct DWconvArgs &args) { ...@@ -1030,8 +1038,9 @@ int ComputeDWConv(const struct DWconvArgs &args) {
reg_writeq(channelXpad_w_channelXstep_w, 0x848); reg_writeq(channelXpad_w_channelXstep_w, 0x848);
reg_writeq(filter_physical_address, 0x850); reg_writeq(filter_physical_address, 0x850);
reg_writeq(filter_param, 0x858); reg_writeq(filter_param, 0x858);
reg_writeq(((bias_physical_address+C_align_64*4) | reg_writeq(((bias_physical_address + C_align_64 * 4) |
(bias_physical_address << 32)), 0x860); (bias_physical_address << 32)),
0x860);
cmd = (uint64_t)1 | (((uint64_t)args.relu_enabled) << 8); cmd = (uint64_t)1 | (((uint64_t)args.relu_enabled) << 8);
reg_writeq(cmd, 0x800); reg_writeq(cmd, 0x800);
......
...@@ -554,8 +554,8 @@ PMStatus Executor<Device, T>::Predict() { ...@@ -554,8 +554,8 @@ PMStatus Executor<Device, T>::Predict() {
clock_gettime(CLOCK_MONOTONIC, &ts); clock_gettime(CLOCK_MONOTONIC, &ts);
profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif #endif
DLOG << i << "th, " LOG(paddle_mobile::kLOG_INFO) << i << "th, "
<< "run op: " << op_handler->Type(); << "run op: " << op_handler->Type();
if (lod_mode_ && input_dim_has_changed_) { if (lod_mode_ && input_dim_has_changed_) {
op_handler->InferShape(); op_handler->InferShape();
} }
......
...@@ -246,7 +246,7 @@ LOAD_OP2(fusion_conv_bn, CPU, FPGA); ...@@ -246,7 +246,7 @@ LOAD_OP2(fusion_conv_bn, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_bn); LOAD_FUSION_MATCHER(fusion_conv_bn);
#endif #endif
#ifdef ELEMENTWISESUB_OP #ifdef ELEMENTWISESUB_OP
LOAD_OP1(elementwise_sub, CPU) LOAD_OP2(elementwise_sub, CPU, GPU_CL)
#endif #endif
#ifdef TOP_K_OP #ifdef TOP_K_OP
LOAD_OP1(top_k, CPU) LOAD_OP1(top_k, CPU)
...@@ -380,3 +380,9 @@ LOAD_OP1(reduce_prod, CPU); ...@@ -380,3 +380,9 @@ LOAD_OP1(reduce_prod, CPU);
#ifdef PIXEL_SHUFFLE_OP #ifdef PIXEL_SHUFFLE_OP
LOAD_OP1(pixel_shuffle, GPU_CL); LOAD_OP1(pixel_shuffle, GPU_CL);
#endif #endif
#ifdef EXPAND_OP
LOAD_OP1(expand, GPU_CL);
#endif
#ifdef GRID_SAMPLER_OP
LOAD_OP1(grid_sampler, GPU_CL);
#endif
...@@ -262,6 +262,37 @@ void PaddleMobilePredictor<Device, T>::Predict_From_To(int start, int end) { ...@@ -262,6 +262,37 @@ void PaddleMobilePredictor<Device, T>::Predict_From_To(int start, int end) {
paddle_mobile_->Predict_From_To(start, end); paddle_mobile_->Predict_From_To(start, end);
} }
#else
template <typename Device, typename T>
void PaddleMobilePredictor<Device, T>::Feed(const std::string &var_name,
const PaddleTensor &input) {
framework::DDim ddim = framework::make_ddim(input.shape);
framework::Tensor input_tensor(static_cast<T *>(input.data.data()), ddim);
paddle_mobile_->Feed(var_name, input_tensor);
}
template <typename Device, typename T>
void PaddleMobilePredictor<Device, T>::Fetch(const std::string &var_name,
PaddleTensor *output) {
auto output_tensor = paddle_mobile_->Fetch(var_name);
auto ddim = output_tensor->dims();
output->shape.clear();
for (int i = 0; i < ddim.size(); i++) {
output->shape.push_back(static_cast<int>(ddim[i]));
}
int length = output_tensor->numel() * sizeof(T);
if (output->data.length() < length) {
output->data.Resize(length);
}
memcpy(output->data.data(), output_tensor->template data<T>(), length);
}
template <typename Device, typename T>
bool PaddleMobilePredictor<Device, T>::Run() {
paddle_mobile_->Predict();
return true;
}
#endif #endif
template <typename Device, typename T> template <typename Device, typename T>
PaddleMobilePredictor<Device, T>::~PaddleMobilePredictor() { PaddleMobilePredictor<Device, T>::~PaddleMobilePredictor() {
......
...@@ -39,7 +39,10 @@ class PaddleMobilePredictor : public PaddlePredictor { ...@@ -39,7 +39,10 @@ class PaddleMobilePredictor : public PaddlePredictor {
void FetchPaddleTensors(std::vector<PaddleTensor>* outputs) override; void FetchPaddleTensors(std::vector<PaddleTensor>* outputs) override;
void FetchPaddleTensors(PaddleTensor* outputs, int id) override; void FetchPaddleTensors(PaddleTensor* outputs, int id) override;
void GetPaddleTensor(const std::string& name, PaddleTensor* output) override; void GetPaddleTensor(const std::string& name, PaddleTensor* output) override;
#else
void Feed(const std::string& var_name, const PaddleTensor& input);
void Fetch(const std::string& var_name, PaddleTensor* output);
bool Run();
#endif #endif
~PaddleMobilePredictor() override; ~PaddleMobilePredictor() override;
......
...@@ -191,6 +191,10 @@ class PaddlePredictor { ...@@ -191,6 +191,10 @@ class PaddlePredictor {
virtual void FetchPaddleTensors(PaddleTensor* outputs, int id) = 0; virtual void FetchPaddleTensors(PaddleTensor* outputs, int id) = 0;
virtual void GetPaddleTensor(const std::string& name, virtual void GetPaddleTensor(const std::string& name,
PaddleTensor* output) = 0; PaddleTensor* output) = 0;
#else
virtual void Feed(const std::string& var_name, const PaddleTensor& input) = 0;
virtual void Fetch(const std::string& var_name, PaddleTensor* output) = 0;
virtual bool Run() = 0;
#endif #endif
protected: protected:
......
...@@ -32,6 +32,9 @@ namespace ops = paddle_mobile::operators; ...@@ -32,6 +32,9 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(elementwise_sub, ops::ElementwiseSubOp); REGISTER_OPERATOR_CPU(elementwise_sub, ops::ElementwiseSubOp);
#endif #endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(elementwise_sub, ops::ElementwiseSubOp);
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef EXPAND_OP
#include "operators/expand_op.h"
#include <framework/ddim.h>
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void ExpandOp<Dtype, T>::InferShape() const {
auto x_dim = this->param_.InputX()->dims();
int expand_size = this->param_.expand_times.size();
int x_dims_size = x_dim.size();
PADDLE_MOBILE_ENFORCE(expand_size == x_dims_size,
"The number of expand_times size must be qual to the "
"rank of Input(X). The number of expand_times size "
"must be qual to the rank of Input(X).")
framework::DDim out_dims(this->param_.InputX()->dims());
for (size_t i = 0; i < this->param_.expand_times.size(); ++i) {
out_dims[i] *= this->param_.expand_times[i];
}
this->param_.Out()->Resize(out_dims);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(expand, ops::ExpandOp);
#endif
#endif
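For illustration, the InferShape rule above multiplies each input dim by the matching expand_times entry, so a hypothetical input of shape {1, 4, 32, 32} with expand_times = {1, 1, 2, 2} resizes Out to {1, 4, 64, 64}. A minimal stand-alone sketch of the same rule, with framework::DDim replaced by std::vector<int64_t>:

// Stand-alone sketch of the expand InferShape rule above (illustrative only).
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> ExpandInferShape(const std::vector<int64_t> &x_dims,
                                      const std::vector<int> &expand_times) {
  assert(expand_times.size() == x_dims.size());  // same check as the ENFORCE above
  std::vector<int64_t> out_dims(x_dims);
  for (size_t i = 0; i < expand_times.size(); ++i) {
    out_dims[i] *= expand_times[i];  // tile each dim by its expand factor
  }
  return out_dims;  // e.g. {1, 4, 32, 32} x {1, 1, 2, 2} -> {1, 4, 64, 64}
}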
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef EXPAND_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/expand_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
#ifdef EXPAND_OP
DECLARE_OPERATOR(Expand, ExpandParam, ExpandKernel);
#endif
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -45,7 +45,7 @@ class FusionInstanceNormReluMatcher : public framework::FusionOpMatcher { ...@@ -45,7 +45,7 @@ class FusionInstanceNormReluMatcher : public framework::FusionOpMatcher {
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
class FusionInstanceNormReluOp class FusionInstanceNormReluOp
: public framework::OperatorWithKernel< : public framework::OperatorWithKernel<
DeviceType, InstanceNormParam<DeviceType>, DeviceType, FusionInstanceNormReluParam<DeviceType>,
operators::InstanceNormReluKernel<DeviceType, T>> { operators::InstanceNormReluKernel<DeviceType, T>> {
public: public:
FusionInstanceNormReluOp(const string &type, const VariableNameMap &inputs, FusionInstanceNormReluOp(const string &type, const VariableNameMap &inputs,
...@@ -53,7 +53,7 @@ class FusionInstanceNormReluOp ...@@ -53,7 +53,7 @@ class FusionInstanceNormReluOp
const framework::AttributeMap &attrs, const framework::AttributeMap &attrs,
framework::Scope *scope) framework::Scope *scope)
: framework::OperatorWithKernel< : framework::OperatorWithKernel<
DeviceType, InstanceNormParam<DeviceType>, DeviceType, FusionInstanceNormReluParam<DeviceType>,
operators::InstanceNormReluKernel<DeviceType, T>>( operators::InstanceNormReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {} type, inputs, outputs, attrs, scope) {}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef GRID_SAMPLER_OP
#include "operators/grid_sampler_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void GridSamplerOp<Dtype, T>::InferShape() const {
auto x_dim = this->param_.InputX()->dims();
this->param_.Output()->Resize(x_dim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(grid_sampler, ops::GridSamplerOp);
#endif
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef GRID_SAMPLER_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/grid_sampler_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
#ifdef GRID_SAMPLER_OP
DECLARE_OPERATOR(GridSampler, GridSamplerParam, GridSamplerKernel);
#endif
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -24,7 +24,7 @@ namespace operators { ...@@ -24,7 +24,7 @@ namespace operators {
template <typename Dtype, typename T> template <typename Dtype, typename T>
void InstanceNormOp<Dtype, T>::InferShape() const { void InstanceNormOp<Dtype, T>::InferShape() const {
auto x_dims = this->param_.InputX()->dims(); auto x_dims = this->param_.InputX()->dims();
this->param_.Out()->Resize(x_dims); this->param_.OutputY()->Resize(x_dims);
} }
} // namespace operators } // namespace operators
......
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef CONV_OP
#include "operators/kernel/arm/convolution/conv_common.h" #include "operators/kernel/arm/convolution/conv_common.h"
#include "framework/context.h" #include "framework/context.h"
#include "operators/math/gemm/gemm1x1s1.h" #include "operators/math/gemm/gemm1x1s1.h"
...@@ -111,3 +113,4 @@ void InitBaseConvKernel(ConvParam<CPU> *param) { ...@@ -111,3 +113,4 @@ void InitBaseConvKernel(ConvParam<CPU> *param) {
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
...@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef CONV_OP
#include "operators/kernel/central-arm-func/conv_arm_func.h" #include "operators/kernel/central-arm-func/conv_arm_func.h"
#include <vector> #include <vector>
...@@ -375,3 +376,4 @@ template void DepthwiseConv5x5<int8_t, int32_t>(const ConvParam<CPU> &param); ...@@ -375,3 +376,4 @@ template void DepthwiseConv5x5<int8_t, int32_t>(const ConvParam<CPU> &param);
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
...@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef CONV_OP
#include "operators/kernel/cl/cl-kernel-func/conv_func.h" #include "operators/kernel/cl/cl-kernel-func/conv_func.h"
#include <vector> #include <vector>
...@@ -1123,3 +1124,4 @@ void ConvTranspose3x3s2AddBnRelu(framework::CLHelper *cl_helper, ...@@ -1123,3 +1124,4 @@ void ConvTranspose3x3s2AddBnRelu(framework::CLHelper *cl_helper,
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
...@@ -11,23 +11,23 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,23 +11,23 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef INSTANCENORM_OP
#include "operators/kernel/cl/cl-kernel-func/instancenorm_func.h" #include "operators/kernel/cl/cl-kernel-func/instancenorm_func.h"
#include <algorithm> #include <algorithm>
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
void InstanceNorm(framework::CLHelper *cl_helper, void InstanceNorm(framework::CLHelper *cl_helper,
const InstanceNormParam<GPU_CL> &param) { const framework::CLImage *input, framework::CLImage *output,
float epsilon) {
auto kernel = cl_helper->KernelAt(0); auto kernel = cl_helper->KernelAt(0);
auto &dims = param.Out()->dims(); auto &dims = output->dims();
const int n = dims[0]; const int n = dims[0];
const int c_group = (dims[1] + 3) / 4; const int c_group = (dims[1] + 3) / 4;
const int h = dims[2]; const int h = dims[2];
const int w = dims[3]; const int w = dims[3];
auto epsilon = param.Epsilon(); auto input_image = input->GetCLImage();
auto input = param.InputX()->GetCLImage(); auto out_image = output->GetCLImage();
auto out = param.Out()->GetCLImage();
// DLOG << "Epsilon: " << epsilon; // DLOG << "Epsilon: " << epsilon;
...@@ -66,12 +66,13 @@ void InstanceNorm(framework::CLHelper *cl_helper, ...@@ -66,12 +66,13 @@ void InstanceNorm(framework::CLHelper *cl_helper,
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 5, sizeof(cl_float), &epsilon); clSetKernelArg(kernel, 5, sizeof(cl_float), &epsilon);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 6, sizeof(cl_mem), &input); clSetKernelArg(kernel, 6, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
clSetKernelArg(kernel, 7, sizeof(cl_mem), &out); clSetKernelArg(kernel, 7, sizeof(cl_mem), &out_image);
CL_CHECK_ERRORS(status); CL_CHECK_ERRORS(status);
clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel, 3, NULL, clEnqueueNDRangeKernel(cl_helper->CLCommandQueue(), kernel, 3, NULL,
work_size, local_work_size, 0, NULL, NULL); work_size, local_work_size, 0, NULL, NULL);
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#endif
...@@ -21,7 +21,8 @@ limitations under the License. */ ...@@ -21,7 +21,8 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
void InstanceNorm(framework::CLHelper *cl_helper, void InstanceNorm(framework::CLHelper *cl_helper,
const InstanceNormParam<GPU_CL> &param); const framework::CLImage *input, framework::CLImage *output,
float epsilon);
} }
} // namespace paddle_mobile } // namespace paddle_mobile
#endif #endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void elementwise_sub(__global image2d_t inputImage, __global image2d_t bias, __write_only image2d_t outputImage) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
half4 input = read_imageh(inputImage, sampler, coords);
half4 biase = read_imageh(bias, sampler, coords);
half4 output = input - biase;
write_imageh(outputImage, coords, output);
}
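As a plain C++ reference for what the kernel above computes, each output texel is the element-wise difference of the corresponding input and bias texels; a minimal sketch over flat float buffers (image sampling and half precision are omitted):

// CPU reference for the elementwise_sub kernel above: out = input - bias,
// applied element-wise over buffers of equal length (a sketch, not the CL path).
#include <cstddef>

void ElementwiseSubReference(const float *input, const float *bias,
                             float *output, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    output[i] = input[i] - bias[i];
  }
}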
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void expend_c1(
__private const int OUT_C, __private const int OUT_W,
__private const int OUT_NH,
__private const int IN_C, __private const int IN_W,
__private const int IN_NH,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height,
__read_only image2d_t input, __write_only image2d_t output,
__private const int n_times, __private const int c_times,
__private const int h_times, __private const int w_times) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
return;
}
const int out_n = out_nh / output_height;
const int out_h = out_nh % output_height;
// const real_in_c = out_c * 4 / c_times;
// const int in_c = real_in_c / 4;
const int in_c = 0;
// const int in_c = out_c / c_times;
const int in_w = out_w / w_times;
const int in_h = out_h / h_times;
const int in_n = out_n / n_times;
const int in_nh = in_n * input_height + in_h;
int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
int2 input_pos = (int2)(in_c * IN_W + in_w, in_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
half4 in = read_imageh(input, sampler, input_pos);
in.y = in.x;
in.z = in.x;
in.w = in.x;
write_imageh(output, output_pos, in);
}
__kernel void expend_c2(
__private const int OUT_C, __private const int OUT_W,
__private const int OUT_NH,
__private const int IN_C, __private const int IN_W,
__private const int IN_NH,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height,
__read_only image2d_t input, __write_only image2d_t output,
__private const int n_times, __private const int c_times,
__private const int h_times, __private const int w_times) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
return;
}
const int out_n = out_nh / output_height;
const int out_h = out_nh % output_height;
// const real_in_c = out_c * 4 / c_times;
// const int in_c = real_in_c / 4;
const int in_c = 0;
// const int in_c = out_c / c_times;
const int in_w = out_w / w_times;
const int in_h = out_h / h_times;
const int in_n = out_n / n_times;
const int in_nh = in_n * input_height + in_h;
int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
int2 input_pos = (int2)(in_c * IN_W + in_w, in_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
half4 in = read_imageh(input, sampler, input_pos);
in.z = in.x;
in.w = in.y;
write_imageh(output, output_pos, in);
}
__kernel void expend_c4(
__private const int OUT_C, __private const int OUT_W,
__private const int OUT_NH,
__private const int IN_C, __private const int IN_W,
__private const int IN_NH,
__private const int input_width, /* of one block */
__private const int input_height, /* of one block */
__private const int output_width, __private const int output_height,
__read_only image2d_t input, __write_only image2d_t output,
__private const int n_times, __private const int c_times,
__private const int h_times, __private const int w_times) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= OUT_C || out_w >= OUT_W || out_nh >= OUT_NH) {
return;
}
const int out_n = out_nh / output_height;
const int out_h = out_nh % output_height;
// const real_in_c = out_c * 4 / c_times;
// const int in_c = real_in_c / 4;
const int in_c = 0;
// const int in_c = out_c / c_times;
const int in_w = out_w / w_times;
const int in_h = out_h / h_times;
const int in_n = out_n / n_times;
const int in_nh = in_n * input_height + in_h;
int2 output_pos = (int2)(out_c * OUT_W + out_w, out_nh);
int2 input_pos = (int2)(in_c * IN_W + in_w, in_nh);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
half4 in = read_imageh(input, sampler, input_pos);
write_imageh(output, output_pos, in);
}
\ No newline at end of file
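The three expend_* variants above differ only in how many valid channels the half4 texel carries: expend_c1 broadcasts lane x into y/z/w, expend_c2 copies the x/y pair into z/w, and expend_c4 passes all four lanes through. A small C++ sketch of that lane replication, with Vec4 standing in for the OpenCL half4:

// Sketch of the channel-lane replication performed by expend_c1/_c2/_c4 above.
// Vec4 stands in for the OpenCL half4; only the lane shuffling is modeled.
struct Vec4 { float x, y, z, w; };

Vec4 ReplicateLanes(Vec4 in, int valid_channels) {
  if (valid_channels == 1) {         // expend_c1: broadcast lane x
    in.y = in.x; in.z = in.x; in.w = in.x;
  } else if (valid_channels == 2) {  // expend_c2: duplicate the x/y pair
    in.z = in.x; in.w = in.y;
  }                                  // expend_c4: all four lanes already valid
  return in;
}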
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "cl_common.h"
__kernel void grid_sampler(__private const int out_height,
__private const int out_width,
__read_only image2d_t input,
__read_only image2d_t grid,
__write_only image2d_t output) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2) * 4;
const int out_n = out_nh / out_height;
const int out_h = out_nh % out_height;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int x_grid = out_h / 4 * 2;
int y_grid = out_n * out_width + out_w;
float4 g1 = read_imagef(grid, sampler, (int2)(x_grid, y_grid));
float4 g2 = read_imagef(grid, sampler, (int2)(x_grid + 1, y_grid));
float x = (g1.x + 1) * (out_width - 1) / 2;
float y = (g2.x + 1) * (out_height - 1) / 2;
float x0 = floor(x);
float y0 = floor(y);
int x_p = out_c * out_width + x0;
int y_p = out_n * out_height + y0;
int x_out = out_c * out_width + out_w;
int y_out = out_n * out_height + out_h;
float4 input0 = read_imagef(input, sampler, (int2)(x_p, y_p));
float4 input1 = read_imagef(input, sampler, (int2)(x_p + 1, y_p));
float4 input2 = read_imagef(input, sampler, (int2)(x_p, y_p + 1));
float4 input3 = read_imagef(input, sampler, (int2)(x_p + 1, y_p + 1));
float4 out_val = input0 * (x0 + 1 - x) * (y0 + 1 - y) +
input1 * (x - x0) * (y0 + 1 - y) +
input2 * (x0 + 1 - x) * (y - y0) +
input3 * (x - x0) * (y - y0);
write_imageh(output, (int2)(x_out, y_out), convert_half4(out_val));
x = (g1.y + 1) * (out_width - 1) / 2;
y = (g2.y + 1) * (out_height - 1) / 2;
x0 = floor(x);
y0 = floor(y);
x_p = out_c * out_width + x0;
y_p = out_n * out_height + y0;
input0 = read_imagef(input, sampler, (int2)(x_p, y_p));
input1 = read_imagef(input, sampler, (int2)(x_p + 1, y_p));
input2 = read_imagef(input, sampler, (int2)(x_p, y_p + 1));
input3 = read_imagef(input, sampler, (int2)(x_p + 1, y_p + 1));
out_val = input0 * (x0 + 1 - x) * (y0 + 1 - y) +
input1 * (x - x0) * (y0 + 1 - y) +
input2 * (x0 + 1 - x) * (y - y0) +
input3 * (x - x0) * (y - y0);
write_imageh(output, (int2)(x_out, y_out + 1), convert_half4(out_val));
x = (g1.z + 1) * (out_width - 1) / 2;
y = (g2.z + 1) * (out_height - 1) / 2;
x0 = floor(x);
y0 = floor(y);
x_p = out_c * out_width + x0;
y_p = out_n * out_height + y0;
input0 = read_imagef(input, sampler, (int2)(x_p, y_p));
input1 = read_imagef(input, sampler, (int2)(x_p + 1, y_p));
input2 = read_imagef(input, sampler, (int2)(x_p, y_p + 1));
input3 = read_imagef(input, sampler, (int2)(x_p + 1, y_p + 1));
out_val = input0 * (x0 + 1 - x) * (y0 + 1 - y) +
input1 * (x - x0) * (y0 + 1 - y) +
input2 * (x0 + 1 - x) * (y - y0) +
input3 * (x - x0) * (y - y0);
write_imageh(output, (int2)(x_out, y_out + 2), convert_half4(out_val));
x = (g1.w + 1) * (out_width - 1) / 2;
y = (g2.w + 1) * (out_height - 1) / 2;
x0 = floor(x);
y0 = floor(y);
x_p = out_c * out_width + x0;
y_p = out_n * out_height + y0;
input0 = read_imagef(input, sampler, (int2)(x_p, y_p));
input1 = read_imagef(input, sampler, (int2)(x_p + 1, y_p));
input2 = read_imagef(input, sampler, (int2)(x_p, y_p + 1));
input3 = read_imagef(input, sampler, (int2)(x_p + 1, y_p + 1));
out_val = input0 * (x0 + 1 - x) * (y0 + 1 - y) +
input1 * (x - x0) * (y0 + 1 - y) +
input2 * (x0 + 1 - x) * (y - y0) +
input3 * (x - x0) * (y - y0);
write_imageh(output, (int2)(x_out, y_out + 3), convert_half4(out_val));
}
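For reference, each of the four samples in the kernel above is standard bilinear interpolation: the grid value g in [-1, 1] is mapped to a pixel coordinate and the four neighbouring texels are blended by their fractional distances. A minimal C++ sketch of that computation on a single-channel float image (the clamping done by the CL sampler is made explicit here):

// Bilinear sampling sketch mirroring the arithmetic of the grid_sampler kernel
// above, for a single-channel H x W float image (illustrative only).
#include <algorithm>
#include <cmath>

float BilinearSample(const float *img, int H, int W, float gx, float gy) {
  // Map grid coords from [-1, 1] to pixel space, as in the kernel.
  float x = (gx + 1.f) * (W - 1) / 2.f;
  float y = (gy + 1.f) * (H - 1) / 2.f;
  float x0 = std::floor(x), y0 = std::floor(y);
  auto at = [&](float xi, float yi) {
    int cx = std::min(std::max(static_cast<int>(xi), 0), W - 1);
    int cy = std::min(std::max(static_cast<int>(yi), 0), H - 1);
    return img[cy * W + cx];
  };
  return at(x0, y0) * (x0 + 1 - x) * (y0 + 1 - y) +
         at(x0 + 1, y0) * (x - x0) * (y0 + 1 - y) +
         at(x0, y0 + 1) * (x0 + 1 - x) * (y - y0) +
         at(x0 + 1, y0 + 1) * (x - x0) * (y - y0);
}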
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef ELEMENTWISESUB_OP
#include "operators/kernel/elementwise_sub_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ElementwiseSubKernel<GPU_CL, float>::Init(
ElementwiseSubParam<GPU_CL> *param) {
framework::CLImage *bias = reinterpret_cast<framework::CLImage *>(
const_cast<framework::CLImage *>(param->InputY()));
if (bias->dims().size() == 4) {
if (!bias->isInit()) {
bias->InitNormalCLImage(cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
}
DLOG << " bias: " << *bias;
this->cl_helper_.AddKernel("elementwise_sub", "elementwise_sub_kernel.cl");
} else {
DLOG << "error:bias dims not support";
}
return true;
}
template <>
void ElementwiseSubKernel<GPU_CL, float>::Compute(
const ElementwiseSubParam<GPU_CL> &param) {
auto input = param.InputX();
auto bias = param.InputY();
auto output = param.Out();
cl_int status;
auto kernel = this->cl_helper_.KernelAt(0);
if (bias->dims().size() == 4) {
cl_mem input_image = input->GetCLImage();
cl_mem bias_image = bias->GetCLImage();
cl_mem output_image = output->GetCLImage();
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bias_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
auto width = input->ImageWidth();
auto height = input->ImageHeight();
size_t global_work_size[2] = {width, height};
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2,
NULL, global_work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
} else {
DLOG << "error:bias dims not support";
}
}
template class ElementwiseSubKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef EXPAND_OP
#include "operators/kernel/expand_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ExpandKernel<GPU_CL, float>::Init(ExpandParam<GPU_CL>* param) {
const framework::DDim& input_dims = param->InputX()->dims();
PADDLE_MOBILE_ENFORCE(input_dims.size() == 4,
"expend now support 4 size dims");
if (input_dims[1] == 1) {
this->cl_helper_.AddKernel("expend_c1", "expend.cl");
} else if (input_dims[1] == 2) {
this->cl_helper_.AddKernel("expend_c2", "expend.cl");
} else if (input_dims[1] == 4) {
this->cl_helper_.AddKernel("expend_c4", "expend.cl");
} else {
PADDLE_MOBILE_ENFORCE(false, "expend did not supported this type");
}
return true;
}
template <>
void ExpandKernel<GPU_CL, float>::Compute(const ExpandParam<GPU_CL>& param) {
auto kernel = this->cl_helper_.KernelAt(0);
DLOG << "param.Out()->dims(): " << param.Out()->dims();
const framework::DDim& image_dims = param.Out()->ImageDims();
DLOG << "param.Out()->image_dims(): " << image_dims;
auto out_work_size = this->cl_helper_.DefaultWorkSize(*param.Out());
DLOG << "out_work_size: " << out_work_size;
int out_c_block = out_work_size[0];
int out_w = out_work_size[1];
int out_nh = out_work_size[2];
auto in_work_size = this->cl_helper_.DefaultWorkSize(*param.InputX());
int in_c_block = in_work_size[0];
int in_w = in_work_size[1];
int in_nh = in_work_size[2];
int input_width = param.InputX()->dims()[3];
int input_height = param.InputX()->dims()[2];
int output_width = param.Out()->dims()[3];
int output_height = param.Out()->dims()[2];
const auto* input = param.InputX();
auto* output = param.Out();
vector<int> expandTimes = {1, 1, 1, 1};
DLOG << "param.expand_times: " << param.expand_times;
for (int i = 0; i < param.expand_times.size(); ++i) {
expandTimes[i] = param.expand_times[i];
}
DLOG << "expandTimes: " << expandTimes;
auto inputImage = input->GetCLImage();
auto outputImage = output->GetCLImage();
input->dims();
int idx = 0;
cl_int status;
status = clSetKernelArg(kernel, idx++, sizeof(int), &out_c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &out_w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &out_nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &in_c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &in_w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &in_nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(cl_mem), &inputImage);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(cl_mem), &outputImage);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &expandTimes[0]);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &expandTimes[1]);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &expandTimes[2]);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, idx++, sizeof(int), &expandTimes[3]);
CL_CHECK_ERRORS(status);
status =
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
out_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
DLOG << *output;
}
template class ExpandKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef GRID_SAMPLER_OP
#include "operators/kernel/grid_sampler_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool GridSamplerKernel<GPU_CL, float>::Init(GridSamplerParam<GPU_CL>* param) {
this->cl_helper_.AddKernel("grid_sampler", "grid_sampler_kernel.cl");
return true;
}
template <>
void GridSamplerKernel<GPU_CL, float>::Compute(
const GridSamplerParam<GPU_CL>& param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Output()));
cl_int status;
auto output = param.Output();
auto input = param.InputX();
auto grid = param.Grid();
auto output_image = output->GetCLImage();
auto input_image = input->GetCLImage();
auto grid_image = grid->GetCLImage();
const int out_H = output->dims()[2];
const int out_W = output->dims()[3];
status = clSetKernelArg(kernel, 0, sizeof(cl_int), &out_H);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_int), &out_W);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &grid_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
const size_t work_size[3] = {default_work_size[0], default_work_size[1],
default_work_size[2] / 4};
status = clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3,
NULL, work_size, NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class GridSamplerKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -23,7 +23,7 @@ namespace operators { ...@@ -23,7 +23,7 @@ namespace operators {
template <> template <>
bool InstanceNormKernel<GPU_CL, float>::Init(InstanceNormParam<GPU_CL> *param) { bool InstanceNormKernel<GPU_CL, float>::Init(InstanceNormParam<GPU_CL> *param) {
auto &dims = param->Out()->dims(); auto &dims = param->OutputY()->dims();
const int h = dims[2]; const int h = dims[2];
std::string build_options = ""; std::string build_options = "";
if (h == 128) { if (h == 128) {
...@@ -41,7 +41,8 @@ bool InstanceNormKernel<GPU_CL, float>::Init(InstanceNormParam<GPU_CL> *param) { ...@@ -41,7 +41,8 @@ bool InstanceNormKernel<GPU_CL, float>::Init(InstanceNormParam<GPU_CL> *param) {
template <> template <>
void InstanceNormKernel<GPU_CL, float>::Compute( void InstanceNormKernel<GPU_CL, float>::Compute(
const InstanceNormParam<GPU_CL> &param) { const InstanceNormParam<GPU_CL> &param) {
InstanceNorm(&this->cl_helper_, param); InstanceNorm(&this->cl_helper_, param.InputX(), param.OutputY(),
param.Epsilon());
} }
template class InstanceNormKernel<GPU_CL, float>; template class InstanceNormKernel<GPU_CL, float>;
......
...@@ -23,7 +23,7 @@ namespace operators { ...@@ -23,7 +23,7 @@ namespace operators {
template <> template <>
bool InstanceNormReluKernel<GPU_CL, float>::Init( bool InstanceNormReluKernel<GPU_CL, float>::Init(
InstanceNormParam<GPU_CL> *param) { FusionInstanceNormReluParam<GPU_CL> *param) {
auto &dims = param->Out()->dims(); auto &dims = param->Out()->dims();
const int h = dims[2]; const int h = dims[2];
std::string build_options = "-DRELU"; std::string build_options = "-DRELU";
...@@ -41,8 +41,8 @@ bool InstanceNormReluKernel<GPU_CL, float>::Init( ...@@ -41,8 +41,8 @@ bool InstanceNormReluKernel<GPU_CL, float>::Init(
template <> template <>
void InstanceNormReluKernel<GPU_CL, float>::Compute( void InstanceNormReluKernel<GPU_CL, float>::Compute(
const InstanceNormParam<GPU_CL> &param) { const FusionInstanceNormReluParam<GPU_CL> &param) {
InstanceNorm(&this->cl_helper_, param); InstanceNorm(&this->cl_helper_, param.InputX(), param.Out(), param.Epsilon());
} }
template class InstanceNormReluKernel<GPU_CL, float>; template class InstanceNormReluKernel<GPU_CL, float>;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
#ifdef EXPAND_OP
DECLARE_KERNEL(Expand, ExpandParam);
#endif // EXPAND_OP
} // namespace operators
} // namespace paddle_mobile
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef ELEMENTWISEADD_OP #ifdef ELEMENTWISEADD_OP
#include <math.h>
#include "operators/kernel/elementwise_add_kernel.h" #include "operators/kernel/elementwise_add_kernel.h"
#include <math.h>
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -62,14 +62,14 @@ void ComputeCPUEWAdd(fpga::EWAddArgs ewaddArgs) { ...@@ -62,14 +62,14 @@ void ComputeCPUEWAdd(fpga::EWAddArgs ewaddArgs) {
int inputh = ewaddArgs.image0.height; int inputh = ewaddArgs.image0.height;
int inputw = ewaddArgs.image0.width; int inputw = ewaddArgs.image0.width;
float inScale0 = float inScale0 =
(reinterpret_cast<float*>(ewaddArgs.image0.scale_address))[0]; (reinterpret_cast<float *>(ewaddArgs.image0.scale_address))[0];
float inScale1 = float inScale1 =
(reinterpret_cast<float*>(ewaddArgs.image1.scale_address))[0]; (reinterpret_cast<float *>(ewaddArgs.image1.scale_address))[0];
float outScale = float outScale =
(reinterpret_cast<float*>(ewaddArgs.output.scale_address))[0]; (reinterpret_cast<float *>(ewaddArgs.output.scale_address))[0];
int8_t* inPtr0 = reinterpret_cast<int8_t*>(ewaddArgs.image0.address); int8_t *inPtr0 = reinterpret_cast<int8_t *>(ewaddArgs.image0.address);
int8_t* inPtr1 = reinterpret_cast<int8_t*>(ewaddArgs.image1.address); int8_t *inPtr1 = reinterpret_cast<int8_t *>(ewaddArgs.image1.address);
int8_t* outPtr = reinterpret_cast<int8_t*>(ewaddArgs.output.address); int8_t *outPtr = reinterpret_cast<int8_t *>(ewaddArgs.output.address);
int datasize = inputc * inputh * inputw; int datasize = inputc * inputh * inputw;
float const0 = inScale0 / outScale; float const0 = inScale0 / outScale;
float const1 = inScale1 / outScale; float const1 = inScale1 / outScale;
......
...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef FUSION_ELEMENTWISEADDRELU_OP #ifdef FUSION_ELEMENTWISEADDRELU_OP
#include <math.h>
#include "operators/kernel/elementwise_add_relu_kernel.h" #include "operators/kernel/elementwise_add_relu_kernel.h"
#include <math.h>
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -63,14 +63,14 @@ void ComputeCPUEWAddRelu(fpga::EWAddArgs ewaddArgs) { ...@@ -63,14 +63,14 @@ void ComputeCPUEWAddRelu(fpga::EWAddArgs ewaddArgs) {
int inputh = ewaddArgs.image0.height; int inputh = ewaddArgs.image0.height;
int inputw = ewaddArgs.image0.width; int inputw = ewaddArgs.image0.width;
float inScale0 = float inScale0 =
(reinterpret_cast<float*>(ewaddArgs.image0.scale_address))[0]; (reinterpret_cast<float *>(ewaddArgs.image0.scale_address))[0];
float inScale1 = float inScale1 =
(reinterpret_cast<float*>(ewaddArgs.image1.scale_address))[0]; (reinterpret_cast<float *>(ewaddArgs.image1.scale_address))[0];
float outScale = float outScale =
(reinterpret_cast<float*>(ewaddArgs.output.scale_address))[0]; (reinterpret_cast<float *>(ewaddArgs.output.scale_address))[0];
int8_t* inPtr0 = reinterpret_cast<int8_t*>(ewaddArgs.image0.address); int8_t *inPtr0 = reinterpret_cast<int8_t *>(ewaddArgs.image0.address);
int8_t* inPtr1 = reinterpret_cast<int8_t*>(ewaddArgs.image1.address); int8_t *inPtr1 = reinterpret_cast<int8_t *>(ewaddArgs.image1.address);
int8_t* outPtr = reinterpret_cast<int8_t*>(ewaddArgs.output.address); int8_t *outPtr = reinterpret_cast<int8_t *>(ewaddArgs.output.address);
int datasize = inputc * inputh * inputw; int datasize = inputc * inputh * inputw;
float const0 = inScale0 / outScale; float const0 = inScale0 / outScale;
float const1 = inScale1 / outScale; float const1 = inScale1 / outScale;
......
...@@ -331,7 +331,7 @@ std::pair<Tensor, Tensor> ProposalForOneImage( ...@@ -331,7 +331,7 @@ std::pair<Tensor, Tensor> ProposalForOneImage(
keep_nms.Resize({post_nms_top_n}); keep_nms.Resize({post_nms_top_n});
} }
proposals.mutable_data<T>({keep_nms.numel(), 4}); // original proposals.mutable_data<T>({keep_nms.numel(), 4}); // original
scores_sel.mutable_data<int8_t>({keep_nms.numel(), 1}); // original scores_sel.mutable_data<int8_t>({keep_nms.numel(), 1}); // original
CPUGather<T>(bbox_sel, keep_nms, &proposals); CPUGather<T>(bbox_sel, keep_nms, &proposals);
...@@ -371,8 +371,8 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) { ...@@ -371,8 +371,8 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
for (int h = 0; h < score_height; h++) { for (int h = 0; h < score_height; h++) {
for (int w = 0; w < score_width; w++) { for (int w = 0; w < score_width; w++) {
for (int c = 0; c < score_channels; ++c) { for (int c = 0; c < score_channels; ++c) {
int dstidx = h*unalignedCW + w*score_channels + c; int dstidx = h * unalignedCW + w * score_channels + c;
int srcidx = h*alignedCW + w*score_channels + c; int srcidx = h * alignedCW + w * score_channels + c;
score_tensor.data<int8_t>()[dstidx] = input_score_data[srcidx]; score_tensor.data<int8_t>()[dstidx] = input_score_data[srcidx];
} }
} }
...@@ -388,11 +388,11 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) { ...@@ -388,11 +388,11 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
for (int h = 0; h < bbox_height; h++) { for (int h = 0; h < bbox_height; h++) {
for (int w = 0; w < bbox_width; w++) { for (int w = 0; w < bbox_width; w++) {
for (int c = 0; c < bbox_channels; ++c) { for (int c = 0; c < bbox_channels; ++c) {
int dstidx = h*unalignedCW + w*bbox_channels + c; int dstidx = h * unalignedCW + w * bbox_channels + c;
int srcidx = h*alignedCW + w*bbox_channels + c; int srcidx = h * alignedCW + w * bbox_channels + c;
bbox_tensor->data<float>()[dstidx] = bbox_tensor->data<float>()[dstidx] =
(static_cast<int>(input_bbox_data[srcidx]))/127.0* (static_cast<int>(input_bbox_data[srcidx])) / 127.0 *
input_bbox->scale[0]; input_bbox->scale[0];
} }
} }
} }
...@@ -412,14 +412,14 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) { ...@@ -412,14 +412,14 @@ void ProposalKernel<FPGA, float>::Compute(const ProposalParam<FPGA> &param) {
float min_size = param.min_size_; float min_size = param.min_size_;
float eta = param.eta_; float eta = param.eta_;
rpn_rois->mutable_data<float>({bbox_tensor->numel()/4, 4}); rpn_rois->mutable_data<float>({bbox_tensor->numel() / 4, 4});
rpn_roi_probs->mutable_data<int8_t>({input_score->numel()/4, 1}); rpn_roi_probs->mutable_data<int8_t>({input_score->numel() / 4, 1});
framework::LoD lod; framework::LoD lod;
lod.resize(1); lod.resize(1);
auto &lod0 = lod[0]; auto &lod0 = lod[0];
lod0.push_back(0); lod0.push_back(0);
anchors.Resize({anchors.numel()/4, 4}); anchors.Resize({anchors.numel() / 4, 4});
variances.Resize({variances.numel()/4, 4}); variances.Resize({variances.numel() / 4, 4});
int64_t num_proposals = 0; int64_t num_proposals = 0;
for (int64_t i = 0; i < score_n; ++i) { for (int64_t i = 0; i < score_n; ++i) {
......
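The bbox conversion above dequantizes the int8 FPGA output back to float as value / 127.0 * scale; for example, a stored int8 of 64 with scale 2.0 becomes roughly 1.008. A one-function sketch of that rule:

// Dequantization rule used for the bbox tensor above: int8 -> float.
#include <cstdint>

inline float DequantizeInt8(int8_t v, float scale) {
  return static_cast<int>(v) / 127.0f * scale;  // e.g. v = 64, scale = 2.0 -> ~1.008
}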
...@@ -143,7 +143,6 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) { ...@@ -143,7 +143,6 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
"the channels of input X should equal the product of " "the channels of input X should equal the product of "
"output_channels x pooled_height x pooled_width"); "output_channels x pooled_height x pooled_width");
auto output_data = out->mutable_data<float>(); auto output_data = out->mutable_data<float>();
auto input_rois = rois->data<float>(); auto input_rois = rois->data<float>();
...@@ -173,11 +172,11 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) { ...@@ -173,11 +172,11 @@ void PSRoiPoolKernel<FPGA, float>::Compute(const PSRoiPoolParam<FPGA>& param) {
for (int ph = 0; ph < pooled_height; ph++) { for (int ph = 0; ph < pooled_height; ph++) {
for (int pw = 0; pw < pooled_width; pw++) { for (int pw = 0; pw < pooled_width; pw++) {
PSROIPoolingForward<float>( PSROIPoolingForward<float>(input_data, height, width, input_channels,
input_data, height, width, input_channels, offset_output_data, offset_output_data, pooled_height,
pooled_height, pooled_width, output_channels, input_rois, pooled_width, output_channels, input_rois,
bin_size_h, bin_size_w, roi_start_h, roi_start_w, pw, ph, bin_size_h, bin_size_w, roi_start_h,
scale, roi_batch_ind); roi_start_w, pw, ph, scale, roi_batch_ind);
} }
} }
} }
......
...@@ -118,11 +118,10 @@ void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) { ...@@ -118,11 +118,10 @@ void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) {
auto inputdimsize = input->dims().size(); auto inputdimsize = input->dims().size();
auto outputdimsize = output->dims().size(); auto outputdimsize = output->dims().size();
int smallersize = int smallersize =
inputdimsize > outputdimsize ? outputdimsize : inputdimsize; inputdimsize > outputdimsize ? outputdimsize : inputdimsize;
int i = 0; int i = 0;
for (i = 0; i < smallersize; i++) { for (i = 0; i < smallersize; i++) {
if ((input->dims())[i] != (output->dims())[i]) if ((input->dims())[i] != (output->dims())[i]) break;
break;
} }
if (i == smallersize) { if (i == smallersize) {
reshapeNeedFlg = 0; reshapeNeedFlg = 0;
......
...@@ -57,31 +57,30 @@ void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) { ...@@ -57,31 +57,30 @@ void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
int len = end - start; int len = end - start;
size_t size = len * sizeof(int8_t); size_t size = len * sizeof(int8_t);
DLOG << input->fpga_data_num; DLOG << input->fpga_data_num;
fpga::fpga_invalidate(input_ptr, input->fpga_data_num*sizeof(int8_t)); fpga::fpga_invalidate(input_ptr, input->fpga_data_num * sizeof(int8_t));
DLOG << output->fpga_data_num; DLOG << output->fpga_data_num;
fpga::fpga_invalidate(output_ptr, output->fpga_data_num*sizeof(int8_t)); fpga::fpga_invalidate(output_ptr, output->fpga_data_num * sizeof(int8_t));
int unalignedWC = len * W; int unalignedWC = len * W;
int alignedWC = fpga::align_to_x(W * len, IMAGE_ALIGNMENT); int alignedWC = fpga::align_to_x(W * len, IMAGE_ALIGNMENT);
if (unalignedWC != alignedWC) { if (unalignedWC != alignedWC) {
auto tmpOutput = reinterpret_cast<int8_t*> auto tmpOutput =
(fpga::fpga_malloc(len*HW * sizeof(int8_t))); reinterpret_cast<int8_t*>(fpga::fpga_malloc(len * HW * sizeof(int8_t)));
for (int i = 0; i < HW; i++) { for (int i = 0; i < HW; i++) {
memcpy(tmpOutput + len * i, input_ptr + i * channel + start, size); memcpy(tmpOutput + len * i, input_ptr + i * channel + start, size);
}
for (int i = 0; i < H; i++) {
for (int j = 0; j < unalignedWC; j++) {
*(output_ptr + alignedWC * i + j) = *(tmpOutput + unalignedWC * i + j);
} }
for (int i = 0; i < H; i++) { }
for (int j = 0; j < unalignedWC; j++) { fpga::fpga_free(tmpOutput);
*(output_ptr + alignedWC * i + j) =
*(tmpOutput + unalignedWC * i + j);
}
}
fpga::fpga_free(tmpOutput);
} else { } else {
for (int i = 0; i < HW; i++) { for (int i = 0; i < HW; i++) {
memcpy(output_ptr + len * i, input_ptr + i * channel + start, size); memcpy(output_ptr + len * i, input_ptr + i * channel + start, size);
} }
} }
fpga::fpga_flush(output_ptr, output->fpga_data_num*sizeof(int8_t)); fpga::fpga_flush(output_ptr, output->fpga_data_num * sizeof(int8_t));
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
#ifdef GRID_SAMPLER_OP
DECLARE_KERNEL(GridSampler, GridSamplerParam);
#endif // GRID_SAMPLER_OP
} // namespace operators
} // namespace paddle_mobile
...@@ -30,10 +30,10 @@ using framework::OpKernelBase; ...@@ -30,10 +30,10 @@ using framework::OpKernelBase;
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
class InstanceNormReluKernel class InstanceNormReluKernel
: public OpKernelBase<DeviceType, InstanceNormParam<DeviceType>> { : public OpKernelBase<DeviceType, FusionInstanceNormReluParam<DeviceType>> {
public: public:
void Compute(const InstanceNormParam<DeviceType> &param); void Compute(const FusionInstanceNormReluParam<DeviceType> &param);
bool Init(InstanceNormParam<DeviceType> *param); bool Init(FusionInstanceNormReluParam<DeviceType> *param);
}; };
} // namespace operators } // namespace operators
......
...@@ -24,8 +24,9 @@ void NearestInterpolationOp<DeviceType, T>::InferShape() const { ...@@ -24,8 +24,9 @@ void NearestInterpolationOp<DeviceType, T>::InferShape() const {
"Input(X) of BilinearInterOp should not be null."); "Input(X) of BilinearInterOp should not be null.");
PADDLE_MOBILE_ENFORCE(this->param_.Out() != nullptr, PADDLE_MOBILE_ENFORCE(this->param_.Out() != nullptr,
"Output(Out) of BilinearInterOp should not be null."); "Output(Out) of BilinearInterOp should not be null.");
auto dim_x = this->param_.InputX()->dims(); // NCHW format auto dim_x = this->param_.InputX()->dims(); // NCHW format
DLOG << "dim_x :" << dim_x;
int out_h = this->param_.OutH(); int out_h = this->param_.OutH();
int out_w = this->param_.OutW(); int out_w = this->param_.OutW();
PADDLE_MOBILE_ENFORCE(dim_x.size() == 4, "X's dimension must be 4"); PADDLE_MOBILE_ENFORCE(dim_x.size() == 4, "X's dimension must be 4");
...@@ -37,8 +38,22 @@ void NearestInterpolationOp<DeviceType, T>::InferShape() const { ...@@ -37,8 +38,22 @@ void NearestInterpolationOp<DeviceType, T>::InferShape() const {
"OutSize's dimension size must be 1"); "OutSize's dimension size must be 1");
PADDLE_MOBILE_ENFORCE(out_size_dim[0] == 2, "OutSize's dim[0] must be 2"); PADDLE_MOBILE_ENFORCE(out_size_dim[0] == 2, "OutSize's dim[0] must be 2");
} }
std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_h, out_w});
this->param_.Out()->Resize(framework::make_ddim(dim_out)); DLOG << "this->param_.HasScale(): " << this->param_.HasScale();
if (this->param_.HasScale()) {
const float scale = this->param_.Scale();
DLOG << "scale_: " << scale;
std::vector<int64_t> dim_out({dim_x[0], dim_x[1],
static_cast<int>(dim_x[2] * scale),
static_cast<int>(dim_x[3] * scale)});
this->param_.Out()->Resize(framework::make_ddim(dim_out));
DLOG << "interp -- dim_out: " << dim_out;
} else {
std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_h, out_w});
this->param_.Out()->Resize(framework::make_ddim(dim_out));
DLOG << "interp -- dim_out: " << dim_out;
}
} }
} // namespace operators } // namespace operators
......
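With the change above, a "scale" attribute, when present, derives the output H/W from the input dims instead of the out_h/out_w attributes; a hypothetical 1x3x16x16 input with scale 2.0 therefore yields a 1x3x32x32 output. A stand-alone sketch of the branch:

// Sketch of the output-shape rule added to NearestInterpolationOp above (NCHW).
#include <cstdint>
#include <vector>

std::vector<int64_t> InterpOutDims(const std::vector<int64_t> &x, bool has_scale,
                                   float scale, int out_h, int out_w) {
  if (has_scale) {
    return {x[0], x[1], static_cast<int64_t>(x[2] * scale),
            static_cast<int64_t>(x[3] * scale)};
  }
  return {x[0], x[1], out_h, out_w};
}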
...@@ -337,6 +337,11 @@ class OpParam { ...@@ -337,6 +337,11 @@ class OpParam {
return GetVarValue<T>("Filter", inputs, scope); return GetVarValue<T>("Filter", inputs, scope);
} }
template <typename T>
static T *GridFrom(const VariableNameMap &inputs, const Scope &scope) {
return GetVarValue<T>("Grid", inputs, scope);
}
template <typename T> template <typename T>
static const T GetAttr(const string &key, const AttributeMap &map) { static const T GetAttr(const string &key, const AttributeMap &map) {
return ((Attribute)map.at(key)).Get<T>(); return ((Attribute)map.at(key)).Get<T>();
...@@ -927,6 +932,35 @@ class InstanceNormParam : public OpParam { ...@@ -927,6 +932,35 @@ class InstanceNormParam : public OpParam {
Scope *scope) Scope *scope)
: OpParam(inputs, outputs, attrs, scope) { : OpParam(inputs, outputs, attrs, scope) {
input_x_ = InputXFrom<GType>(inputs, *scope); input_x_ = InputXFrom<GType>(inputs, *scope);
output_y_ = OutputYFrom<GType>(outputs, *scope);
epsilon_ = GetAttr<float>("epsilon", attrs);
}
const GType *InputX() const { return input_x_; }
GType *OutputY() const { return output_y_; }
const float &Epsilon() const { return epsilon_; }
private:
GType *input_x_;
GType *output_y_;
float epsilon_;
};
#endif
#ifdef FUSION_INSTANCENORM_RELU_OP
template <typename Dtype>
class FusionInstanceNormReluParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
FusionInstanceNormReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, Scope *scope)
: OpParam(inputs, outputs, attrs, scope) {
input_x_ = InputXFrom<GType>(inputs, *scope);
out_ = OutFrom<GType>(outputs, *scope); out_ = OutFrom<GType>(outputs, *scope);
epsilon_ = GetAttr<float>("epsilon", attrs); epsilon_ = GetAttr<float>("epsilon", attrs);
} }
...@@ -3008,7 +3042,7 @@ class SplitParam : public OpParam { ...@@ -3008,7 +3042,7 @@ class SplitParam : public OpParam {
int axis; int axis;
int num; int num;
std::vector<int> sections; std::vector<int> sections;
// std::vector<GType> out_ts_; // std::vector<GType> out_ts_;
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
private: private:
...@@ -3069,12 +3103,20 @@ class NearestInterpolationParam : public OpParam { ...@@ -3069,12 +3103,20 @@ class NearestInterpolationParam : public OpParam {
out_ = OutFrom<GType>(outputs, *scope); out_ = OutFrom<GType>(outputs, *scope);
out_h_ = GetAttr<int>("out_h", attrs); out_h_ = GetAttr<int>("out_h", attrs);
out_w_ = GetAttr<int>("out_w", attrs); out_w_ = GetAttr<int>("out_w", attrs);
if (HasAttr("scale", attrs)) {
has_scale_ = true;
scale_ = GetAttr<float>("scale", attrs);
}
DLOG << "has_scale_: " << has_scale_;
DLOG << "scale_: " << scale_;
} }
const GType *InputX() const { return input_x_; } const GType *InputX() const { return input_x_; }
const GType *InputOutPutSize() const { return input_outsize_; } const GType *InputOutPutSize() const { return input_outsize_; }
GType *Out() const { return out_; } GType *Out() const { return out_; }
int OutH() const { return out_h_; } int OutH() const { return out_h_; }
int OutW() const { return out_w_; } int OutW() const { return out_w_; }
float Scale() const { return scale_; }
bool HasScale() const { return has_scale_; }
private: private:
GType *input_x_; GType *input_x_;
...@@ -3082,6 +3124,8 @@ class NearestInterpolationParam : public OpParam { ...@@ -3082,6 +3124,8 @@ class NearestInterpolationParam : public OpParam {
GType *out_; GType *out_;
int out_h_; int out_h_;
int out_w_; int out_w_;
float scale_ = 0.f;
bool has_scale_ = false;
}; };
#endif #endif
...@@ -3658,5 +3702,60 @@ class PixelShuffleParam : public OpParam { ...@@ -3658,5 +3702,60 @@ class PixelShuffleParam : public OpParam {
}; };
#endif #endif
#ifdef GRID_SAMPLER_OP
template <typename Dtype>
class GridSamplerParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
GridSamplerParam(const VariableNameMap &inputs,
const VariableNameMap &outputs, const AttributeMap &attrs,
Scope *scope)
: OpParam(inputs, outputs, attrs, scope) {
input_x_ = InputXFrom<GType>(inputs, *scope);
grid_ = GridFrom<GType>(inputs, *scope);
output_ = OutputFrom<GType>(outputs, *scope);
}
const GType *InputX() const { return input_x_; }
const GType *Grid() const { return grid_; }
GType *Output() const { return output_; }
private:
GType *input_x_;
GType *grid_;
GType *output_;
};
#endif
#ifdef EXPAND_OP
template <typename Dtype>
class ExpandParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType;
typedef typename DtypeTensorTrait<Dtype>::rtype RType;
public:
ExpandParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
const AttributeMap &attrs, Scope *scope)
: OpParam(inputs, outputs, attrs, scope) {
input_x_ = InputXFrom<GType>(inputs, *scope);
out_ = OutFrom<GType>(outputs, *scope);
expand_times = OpParam::GetAttr<std::vector<int>>("expand_times", attrs);
}
const GType *InputX() const { return input_x_; }
GType *Out() const { return out_; }
std::vector<int> expand_times;
private:
GType *input_x_;
GType *out_;
};
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -45,7 +45,7 @@ if (CON GREATER -1) ...@@ -45,7 +45,7 @@ if (CON GREATER -1)
set(FOUND_MATCH ON) set(FOUND_MATCH ON)
# gen test # gen test
ADD_EXECUTABLE(test-mobilenetgpu net/test_mobilenet_GPU.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-mobilenetgpu net/test_mobilenet_GPU.cpp test_helper.h test_include.h)
target_link_libraries(test-mobilenetgpu paddle-mobile) target_link_libraries(test-mobilenetgpu paddle-mobile)
endif () endif ()
...@@ -105,7 +105,7 @@ if (CON GREATER -1) ...@@ -105,7 +105,7 @@ if (CON GREATER -1)
ADD_EXECUTABLE(test-marker-api fpga/test_marker_api.cpp) ADD_EXECUTABLE(test-marker-api fpga/test_marker_api.cpp)
target_link_libraries(test-marker-api paddle-mobile) target_link_libraries(test-marker-api paddle-mobile)
#ADD_EXECUTABLE(test-marker2 fpga/test_marker2.cpp test_helper.h test_include.h executor_for_test.h ) #ADD_EXECUTABLE(test-marker2 fpga/test_marker2.cpp test_helper.h test_include.h executor_for_test.h )
#target_link_libraries(test-marker2 paddle-mobile) #target_link_libraries(test-marker2 paddle-mobile)
...@@ -193,13 +193,16 @@ endif () ...@@ -193,13 +193,16 @@ endif ()
list(FIND NET "op" CON) list(FIND NET "op" CON)
if (CON GREATER -1) if (CON GREATER -1)
# gen test # # gen test
ADD_EXECUTABLE(test-sigmoid operators/test_sigmoid_op.cpp test_include.h) # ADD_EXECUTABLE(test-sigmoid operators/test_sigmoid_op.cpp test_include.h)
target_link_libraries(test-sigmoid paddle-mobile) # target_link_libraries(test-sigmoid paddle-mobile)
#
# # gen test log
# ADD_EXECUTABLE(test-leakyrelu operators/test_leaky_relu_op.cpp)
# target_link_libraries(test-leakyrelu paddle-mobile)
# gen test log # gen test log
ADD_EXECUTABLE(test-leakyrelu operators/test_leaky_relu_op.cpp) ADD_EXECUTABLE(test-log common/test_log.cpp)
target_link_libraries(test-leakyrelu paddle-mobile) target_link_libraries(test-log paddle-mobile)
set(FOUND_MATCH ON) set(FOUND_MATCH ON)
endif () endif ()
...@@ -208,342 +211,355 @@ if (ENABLE_ALL_TEST) ...@@ -208,342 +211,355 @@ if (ENABLE_ALL_TEST)
# gen test # gen test
ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-resnet paddle-mobile) target_link_libraries(test-resnet paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-squeezenet paddle-mobile) target_link_libraries(test-squeezenet paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-yolo net/test_yolo.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-yolo paddle-mobile) target_link_libraries(test-yolo paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test_yolo_combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test_yolo_combined net/test_yolo_combined.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test_yolo_combined paddle-mobile) target_link_libraries(test_yolo_combined paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-op-in-net net/test_op_in_net.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-op-in-net net/test_op_in_net.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-op-in-net paddle-mobile) target_link_libraries(test-op-in-net paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-googlenet net/test_googlenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-googlenet paddle-mobile) target_link_libraries(test-googlenet paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-googlenet-quali net/test_googlenet_quali.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-googlenet-quali net/test_googlenet_quali.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-googlenet-quali paddle-mobile) target_link_libraries(test-googlenet-quali paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-conv-op operators/test_conv_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-conv-op operators/test_conv_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-op paddle-mobile) target_link_libraries(test-conv-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-expend-op operators/test_expend_op.cpp test_helper.h test_include.h executor_for_test_opencl.h)
target_link_libraries(test-expend-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-mul-op operators/test_mul_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-mul-op operators/test_mul_op.cpp test_helper.h test_include.h)
target_link_libraries(test-mul-op paddle-mobile) target_link_libraries(test-mul-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-elementwiseadd-op operators/test_elementwise_add_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-elementwiseadd-op operators/test_elementwise_add_op.cpp test_helper.h test_include.h)
target_link_libraries(test-elementwiseadd-op paddle-mobile) target_link_libraries(test-elementwiseadd-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-elementwisesub-op operators/test_elementwise_sub_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-elementwisesub-op operators/test_elementwise_sub_op.cpp test_helper.h test_include.h)
target_link_libraries(test-elementwisesub-op paddle-mobile) target_link_libraries(test-elementwisesub-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-im2sequence-op operators/test_im2sequence_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-im2sequence-op operators/test_im2sequence_op.cpp test_helper.h test_include.h)
target_link_libraries(test-im2sequence-op paddle-mobile) target_link_libraries(test-im2sequence-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-concat-op operators/test_concat_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-concat-op operators/test_concat_op.cpp test_helper.h test_include.h)
target_link_libraries(test-concat-op paddle-mobile) target_link_libraries(test-concat-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-lrn-op operators/test_lrn_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-lrn-op operators/test_lrn_op.cpp test_helper.h test_include.h)
target_link_libraries(test-lrn-op paddle-mobile) target_link_libraries(test-lrn-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-batchnorm-op operators/test_batchnorm_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-batchnorm-op operators/test_batchnorm_op.cpp test_helper.h test_include.h)
target_link_libraries(test-batchnorm-op paddle-mobile) target_link_libraries(test-batchnorm-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-priorbox-op operators/test_prior_box_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-priorbox-op operators/test_prior_box_op.cpp test_helper.h test_include.h)
target_link_libraries(test-priorbox-op paddle-mobile) target_link_libraries(test-priorbox-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-boxcoder-op operators/test_box_coder_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-boxcoder-op operators/test_box_coder_op.cpp test_helper.h test_include.h)
target_link_libraries(test-boxcoder-op paddle-mobile) target_link_libraries(test-boxcoder-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-transpose-op operators/test_transpose_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-transpose-op operators/test_transpose_op.cpp test_helper.h test_include.h)
target_link_libraries(test-transpose-op paddle-mobile) target_link_libraries(test-transpose-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-transpose2-op operators/test_transpose2_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-transpose2-op operators/test_transpose2_op.cpp test_helper.h test_include.h)
target_link_libraries(test-transpose2-op paddle-mobile) target_link_libraries(test-transpose2-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-multiclassnms-op operators/test_multiclass_nms_op.cpp test_helper.h test_include.h)
target_link_libraries(test-multiclassnms-op paddle-mobile) target_link_libraries(test-multiclassnms-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-polygon-box-transform-op operators/test_polygon_box_transform_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-polygon-box-transform-op operators/test_polygon_box_transform_op.cpp test_helper.h test_include.h)
target_link_libraries(test-polygon-box-transform-op paddle-mobile) target_link_libraries(test-polygon-box-transform-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-fill-constant-op operators/test_fill_constant_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-fill-constant-op operators/test_fill_constant_op.cpp test_helper.h test_include.h)
target_link_libraries(test-fill-constant-op paddle-mobile) target_link_libraries(test-fill-constant-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-reshape-op operators/test_reshape_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-reshape-op operators/test_reshape_op.cpp test_helper.h test_include.h)
target_link_libraries(test-reshape-op paddle-mobile) target_link_libraries(test-reshape-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-reshape2-op operators/test_reshape2_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-reshape2-op operators/test_reshape2_op.cpp test_helper.h test_include.h)
target_link_libraries(test-reshape2-op paddle-mobile) target_link_libraries(test-reshape2-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-relu-op operators/test_relu_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-relu-op operators/test_relu_op.cpp test_helper.h test_include.h)
target_link_libraries(test-relu-op paddle-mobile) target_link_libraries(test-relu-op paddle-mobile)
ADD_EXECUTABLE(test-relu6-op operators/test_relu6_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-relu6-op operators/test_relu6_op.cpp test_helper.h test_include.h)
target_link_libraries(test-relu6-op paddle-mobile) target_link_libraries(test-relu6-op paddle-mobile)
ADD_EXECUTABLE(test-tanh-op operators/test_tanh_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-tanh-op operators/test_tanh_op.cpp test_helper.h test_include.h)
target_link_libraries(test-tanh-op paddle-mobile) target_link_libraries(test-tanh-op paddle-mobile)
ADD_EXECUTABLE(test-log-op operators/test_log_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-log-op operators/test_log_op.cpp test_helper.h test_include.h)
target_link_libraries(test-log-op paddle-mobile) target_link_libraries(test-log-op paddle-mobile)
ADD_EXECUTABLE(test-topk-op operators/test_topk_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-topk-op operators/test_topk_op.cpp test_helper.h test_include.h)
target_link_libraries(test-topk-op paddle-mobile) target_link_libraries(test-topk-op paddle-mobile)
ADD_EXECUTABLE(test-cast-op operators/test_cast_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-cast-op operators/test_cast_op.cpp test_helper.h test_include.h)
target_link_libraries(test-cast-op paddle-mobile) target_link_libraries(test-cast-op paddle-mobile)
ADD_EXECUTABLE(test-less-than-op operators/test_less_than_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-less-than-op operators/test_less_than_op.cpp test_helper.h test_include.h)
target_link_libraries(test-less-than-op paddle-mobile) target_link_libraries(test-less-than-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-fc-op operators/test_fusion_fc_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-fc-op operators/test_fusion_fc_op.cpp test_helper.h test_include.h)
target_link_libraries(test-fc-op paddle-mobile) target_link_libraries(test-fc-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-sum-op operators/test_sum_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-sum-op operators/test_sum_op.cpp test_helper.h test_include.h)
target_link_libraries(test-sum-op paddle-mobile) target_link_libraries(test-sum-op paddle-mobile)
# test quantize op # test quantize op
ADD_EXECUTABLE(test-quantize-op operators/test_quantize_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-quantize-op operators/test_quantize_op.cpp test_helper.h test_include.h)
target_link_libraries(test-quantize-op paddle-mobile) target_link_libraries(test-quantize-op paddle-mobile)
# test dequantize op # test dequantize op
ADD_EXECUTABLE(test-dequantize-op operators/test_dequantize_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-dequantize-op operators/test_dequantize_op.cpp test_helper.h test_include.h)
target_link_libraries(test-dequantize-op paddle-mobile) target_link_libraries(test-dequantize-op paddle-mobile)
# gen test log # gen test log
ADD_EXECUTABLE(test-log common/test_log.cpp) ADD_EXECUTABLE(test-log common/test_log.cpp)
target_link_libraries(test-log paddle-mobile) target_link_libraries(test-log paddle-mobile)
# gen test log # gen test log
ADD_EXECUTABLE(test-load framework/test_load.cpp) ADD_EXECUTABLE(test-load framework/test_load.cpp)
target_link_libraries(test-load paddle-mobile) target_link_libraries(test-load paddle-mobile)
# gen test log # gen test log
ADD_EXECUTABLE(test-loadmemory framework/test_load_memory.cpp) ADD_EXECUTABLE(test-loadmemory framework/test_load_memory.cpp)
target_link_libraries(test-loadmemory paddle-mobile) target_link_libraries(test-loadmemory paddle-mobile)
# gen test log # gen test log
ADD_EXECUTABLE(test-loadmemory-inference framework/test_load_memory_inference_api.cpp) ADD_EXECUTABLE(test-loadmemory-inference framework/test_load_memory_inference_api.cpp)
target_link_libraries(test-loadmemory-inference paddle-mobile) target_link_libraries(test-loadmemory-inference paddle-mobile)
ADD_EXECUTABLE(test-inference-api framework/test_inference_api.cpp) ADD_EXECUTABLE(test-inference-api framework/test_inference_api.cpp)
target_link_libraries(test-inference-api paddle-mobile) target_link_libraries(test-inference-api paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-optimize framework/test_optimize.cpp) ADD_EXECUTABLE(test-optimize framework/test_optimize.cpp)
target_link_libraries(test-optimize paddle-mobile) target_link_libraries(test-optimize paddle-mobile)
#gen test #gen test
ADD_EXECUTABLE(test-pool-op operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-pool-op operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-pool-op paddle-mobile) target_link_libraries(test-pool-op paddle-mobile)
#gen test #gen test
ADD_EXECUTABLE(test-softmax-op operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-softmax-op operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-softmax-op paddle-mobile) target_link_libraries(test-softmax-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-gemm-accuracy common/test_gemm_accuracy.cpp) ADD_EXECUTABLE(test-gemm-accuracy common/test_gemm_accuracy.cpp)
target_link_libraries(test-gemm-accuracy paddle-mobile) target_link_libraries(test-gemm-accuracy paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-gemm-int8-accuracy common/test_gemm_int8_accuracy.cpp) ADD_EXECUTABLE(test-gemm-int8-accuracy common/test_gemm_int8_accuracy.cpp)
target_link_libraries(test-gemm-int8-accuracy paddle-mobile) target_link_libraries(test-gemm-int8-accuracy paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-gemm-perf common/test_gemm_perf.cpp) ADD_EXECUTABLE(test-gemm-perf common/test_gemm_perf.cpp)
target_link_libraries(test-gemm-perf paddle-mobile) target_link_libraries(test-gemm-perf paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-enforce common/test_enforce.cpp) ADD_EXECUTABLE(test-enforce common/test_enforce.cpp)
target_link_libraries(test-enforce paddle-mobile) target_link_libraries(test-enforce paddle-mobile)
# gen test - test if openmp works # gen test - test if openmp works
ADD_EXECUTABLE(test-openmp common/test_openmp.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-openmp common/test_openmp.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-openmp paddle-mobile) target_link_libraries(test-openmp paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-mobilenetssd net/test_mobilenet+ssd.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-mobilenetssd paddle-mobile) target_link_libraries(test-mobilenetssd paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-mobilenet-combine net/test_mobilenet_combine.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-mobilenet-combine net/test_mobilenet_combine.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-mobilenet-combine paddle-mobile) target_link_libraries(test-mobilenet-combine paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-genet net/test_genet_combine.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-genet net/test_genet_combine.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-genet paddle-mobile) target_link_libraries(test-genet paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-sigmoid-op operators/test_sigmoid_op.cpp test_include.h) ADD_EXECUTABLE(test-sigmoid-op operators/test_sigmoid_op.cpp test_include.h)
target_link_libraries(test-sigmoid-op paddle-mobile) target_link_libraries(test-sigmoid-op paddle-mobile)
# gen test log # gen test log
ADD_EXECUTABLE(test-leakyrelu operators/test_leaky_relu_op.cpp) ADD_EXECUTABLE(test-leakyrelu operators/test_leaky_relu_op.cpp)
target_link_libraries(test-leakyrelu paddle-mobile) target_link_libraries(test-leakyrelu paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-depthwise-conv-op operators/test_depthwise_conv_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-depthwise-conv-op operators/test_depthwise_conv_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-depthwise-conv-op paddle-mobile) target_link_libraries(test-depthwise-conv-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-mobilenet net/test_mobilenet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-mobilenet paddle-mobile) target_link_libraries(test-mobilenet paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-relu-op paddle-mobile) target_link_libraries(test-conv-add-relu-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-bn-relu-op paddle-mobile) target_link_libraries(test-conv-add-bn-relu-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-nlp net/test_nlp.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-nlp net/test_nlp.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-nlp paddle-mobile) target_link_libraries(test-nlp paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-gru-op operators/test_gru_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-gru-op operators/test_gru_op.cpp test_helper.h test_include.h)
target_link_libraries(test-gru-op paddle-mobile) target_link_libraries(test-gru-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-inceptionv4 net/test_inceptionv4.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-inceptionv4 net/test_inceptionv4.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-inceptionv4 paddle-mobile) target_link_libraries(test-inceptionv4 paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-alexnet net/test_alexnet.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-alexnet net/test_alexnet.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-alexnet paddle-mobile) target_link_libraries(test-alexnet paddle-mobile)
ADD_EXECUTABLE(test-googlenetv1 net/test_googlenetv1_combine.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-googlenetv1 net/test_googlenetv1_combine.cpp test_helper.h test_include.h)
target_link_libraries(test-googlenetv1 paddle-mobile) target_link_libraries(test-googlenetv1 paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-fssd net/test_mobilenet_025_fssd.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-fssd net/test_mobilenet_025_fssd.cpp test_helper.h test_include.h)
target_link_libraries(test-fssd paddle-mobile) target_link_libraries(test-fssd paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-mobilenetgpu net/test_mobilenet_GPU.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-mobilenetgpu net/test_mobilenet_GPU.cpp test_helper.h test_include.h)
target_link_libraries(test-mobilenetgpu paddle-mobile) target_link_libraries(test-mobilenetgpu paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-yologpu net/test_yologpu.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-yologpu net/test_yologpu.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-yologpu paddle-mobile) target_link_libraries(test-yologpu paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-multi-process net/test_multi_inference_predict.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-multi-process net/test_multi_inference_predict.cpp test_helper.h test_include.h)
target_link_libraries(test-multi-process paddle-mobile) target_link_libraries(test-multi-process paddle-mobile)
# gen test benchmark # gen test benchmark
ADD_EXECUTABLE(test-benchmark net/test_benchmark.cpp) ADD_EXECUTABLE(test-benchmark net/test_benchmark.cpp)
target_link_libraries(test-benchmark paddle-mobile) target_link_libraries(test-benchmark paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-eng net/test_eng.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-eng net/test_eng.cpp test_helper.h test_include.h)
target_link_libraries(test-eng paddle-mobile) target_link_libraries(test-eng paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-super net/test_super.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-super net/test_super.cpp test_helper.h test_include.h)
target_link_libraries(test-super paddle-mobile) target_link_libraries(test-super paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-ocr net/test_ocr.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-ocr net/test_ocr.cpp test_helper.h test_include.h)
target_link_libraries(test-ocr paddle-mobile) target_link_libraries(test-ocr paddle-mobile)
ADD_EXECUTABLE(test-gesture net/test_gesture.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-gesture net/test_gesture.cpp test_helper.h test_include.h)
target_link_libraries(test-gesture paddle-mobile) target_link_libraries(test-gesture paddle-mobile)
ADD_EXECUTABLE(test-sequence-expand-op operators/test_sequence_expand_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-sequence-expand-op operators/test_sequence_expand_op.cpp test_helper.h test_include.h)
target_link_libraries(test-sequence-expand-op paddle-mobile) target_link_libraries(test-sequence-expand-op paddle-mobile)
ADD_EXECUTABLE(test-sequence-pool-op operators/test_sequence_pool_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-sequence-pool-op operators/test_sequence_pool_op.cpp test_helper.h test_include.h)
target_link_libraries(test-sequence-pool-op paddle-mobile) target_link_libraries(test-sequence-pool-op paddle-mobile)
ADD_EXECUTABLE(test-sequence-softmax-op operators/test_sequence_softmax_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-sequence-softmax-op operators/test_sequence_softmax_op.cpp test_helper.h test_include.h)
target_link_libraries(test-sequence-softmax-op paddle-mobile) target_link_libraries(test-sequence-softmax-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-vgg16ssd net/test_vgg16ssd.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-vgg16ssd net/test_vgg16ssd.cpp test_helper.h test_include.h)
target_link_libraries(test-vgg16ssd paddle-mobile) target_link_libraries(test-vgg16ssd paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-logical-and-op operators/test_logical_and_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-logical-and-op operators/test_logical_and_op.cpp test_helper.h test_include.h)
target_link_libraries(test-logical-and-op paddle-mobile) target_link_libraries(test-logical-and-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-logical-or-op operators/test_logical_or_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-logical-or-op operators/test_logical_or_op.cpp test_helper.h test_include.h)
target_link_libraries(test-logical-or-op paddle-mobile) target_link_libraries(test-logical-or-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-logical-not-op operators/test_logical_not_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-logical-not-op operators/test_logical_not_op.cpp test_helper.h test_include.h)
target_link_libraries(test-logical-not-op paddle-mobile) target_link_libraries(test-logical-not-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-logical-xor-op operators/test_logical_xor_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-logical-xor-op operators/test_logical_xor_op.cpp test_helper.h test_include.h)
target_link_libraries(test-logical-xor-op paddle-mobile) target_link_libraries(test-logical-xor-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-increment-op operators/test_increment_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-increment-op operators/test_increment_op.cpp test_helper.h test_include.h)
target_link_libraries(test-increment-op paddle-mobile) target_link_libraries(test-increment-op paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-is-empty-op operators/test_is_empty_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-is-empty-op operators/test_is_empty_op.cpp test_helper.h test_include.h)
target_link_libraries(test-is-empty-op paddle-mobile) target_link_libraries(test-is-empty-op paddle-mobile)
ADD_EXECUTABLE(test-conv-bn-relu-op operators/test_conv_bn_relu_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-conv-bn-relu-op operators/test_conv_bn_relu_op.cpp test_helper.h test_include.h)
target_link_libraries(test-conv-bn-relu-op paddle-mobile) target_link_libraries(test-conv-bn-relu-op paddle-mobile)
ADD_EXECUTABLE(test-dwconv-bn-relu-op operators/test_dwconv_bn_relu_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-dwconv-bn-relu-op operators/test_dwconv_bn_relu_op.cpp test_helper.h test_include.h)
target_link_libraries(test-dwconv-bn-relu-op paddle-mobile) target_link_libraries(test-dwconv-bn-relu-op paddle-mobile)
ADD_EXECUTABLE(test-conv-gpu operators/test_conv_gpu.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-conv-gpu operators/test_conv_gpu.cpp test_helper.h test_include.h)
target_link_libraries(test-conv-gpu paddle-mobile) target_link_libraries(test-conv-gpu paddle-mobile)
ADD_EXECUTABLE(test-net-benchmark net/test_net_benchmark.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-net-benchmark net/test_net_benchmark.cpp test_helper.h test_include.h)
target_link_libraries(test-net-benchmark paddle-mobile) target_link_libraries(test-net-benchmark paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-net net/test_net.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-net net/test_net.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-net paddle-mobile) target_link_libraries(test-net paddle-mobile)
# gen test
ADD_EXECUTABLE(test-net-feeds net/test_net_multi_feed.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-net-feeds paddle-mobile)
# gen test # gen test
ADD_EXECUTABLE(test-net-performance net/test_net_performance.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-net-performance net/test_net_performance.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-net-performance paddle-mobile) target_link_libraries(test-net-performance paddle-mobile)
ADD_EXECUTABLE(test-inference-api-v2 net/test_inference_api_v2.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-inference-api-v2 paddle-mobile)
endif () endif ()
else() else ()
# gen test # gen test
ADD_EXECUTABLE(test-net net/test_net.cpp test_helper.h test_include.h executor_for_test.h) ADD_EXECUTABLE(test-net net/test_net.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-net paddle-mobile) target_link_libraries(test-net paddle-mobile)
ADD_EXECUTABLE(test-net-benchmark net/test_net_benchmark.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-net-benchmark net/test_net_benchmark.cpp test_helper.h test_include.h)
target_link_libraries(test-net-benchmark paddle-mobile) target_link_libraries(test-net-benchmark paddle-mobile)
endif()
ADD_EXECUTABLE(test-inference-api-v2 net/test_inference_api_v2.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-inference-api-v2 paddle-mobile)
endif ()
...@@ -15,10 +15,8 @@ limitations under the License. */ ...@@ -15,10 +15,8 @@ limitations under the License. */
#include "common/log.h" #include "common/log.h"
int main() { int main() {
DLOGF("DASJFDAFJ%d -- %f", 12345, 344.234); LOG(paddle_mobile::kLOG_DEBUG3) << "test debug"
<< " next log";
LOGF(paddle_mobile::kLOG_DEBUG, "DASJFDAFJ%d -- %f", 12345, 344.234);
LOG(paddle_mobile::kLOG_DEBUG) << "test debug" LOG(paddle_mobile::kLOG_DEBUG) << "test debug"
<< " next log"; << " next log";
...@@ -26,9 +24,12 @@ int main() { ...@@ -26,9 +24,12 @@ int main() {
<< " next log"; << " next log";
LOG(paddle_mobile::kLOG_DEBUG2) << "test debug2" LOG(paddle_mobile::kLOG_DEBUG2) << "test debug2"
<< " next log"; << " next log";
LOG(paddle_mobile::kLOG_INFO) << "INFO!!!";
LOG(paddle_mobile::kLOG_WARNING) << "WARNING!!!";
LOG(paddle_mobile::kLOG_VERBOSE) << "VERBOSE!!!";
DLOG << "test DLOG"; DLOG << "test DLOG";
LOG(paddle_mobile::kLOG_ERROR) << " error occur !"; LOG(paddle_mobile::kLOG_ERROR) << "ERROR !";
return 0; return 0;
} }

...@@ -14,9 +14,9 @@ limitations under the License. */ ...@@ -14,9 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "common/log.h" #include "common/log.h"
#include "framework/executor.h" #include "framework/executor.h"
#include "framework/op_registry.h" #include "framework/op_registry.h"
...@@ -74,8 +74,11 @@ class Executor4Test : public Executor<DeviceType> { ...@@ -74,8 +74,11 @@ class Executor4Test : public Executor<DeviceType> {
break; break;
} }
} }
if (this->program_.combined) {
this->InitMemory(); this->InitCombineMemory();
} else {
this->InitMemory();
}
for (const auto &op : this->ops_of_block0_) { for (const auto &op : this->ops_of_block0_) {
op->Init(); op->Init();
} }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_MOBILE_CL
#include <memory>
#include <string>
#include <vector>
#include "./test_helper.h"
#include "common/log.h"
#include "framework/cl/cl_helper.h"
#include "framework/cl/cl_tensor.h"
#include "framework/executor.h"
#include "framework/op_registry.h"
#include "operators/feed_op.h"
#include "operators/fetch_op.h"
using paddle_mobile::framework::AttributeMap;
using paddle_mobile::framework::BlockDesc;
using paddle_mobile::framework::DDim;
using paddle_mobile::framework::Executor;
using paddle_mobile::framework::LoDTensor;
using paddle_mobile::framework::OpDesc;
using paddle_mobile::framework::OperatorBase;
using paddle_mobile::framework::Program;
using paddle_mobile::framework::Tensor;
using paddle_mobile::framework::Variable;
using std::string;
using std::vector;
namespace paddle_mobile {
template <typename OpType>
class OpenClOpTester {
public:
OpenClOpTester() {
framework::CLEngine::Instance()->setClPath("/data/local/tmp/bin");
scope_ = std::make_shared<paddle_mobile::framework::Scope>();
feed_clhelper_ = framework::CLHelper(scope_->GetCLScpoe());
fetch_clhelper_ = framework::CLHelper(scope_->GetCLScpoe());
this->feed_clhelper_.AddKernel("feed", "feed_kernel.cl");
this->fetch_clhelper_.AddKernel("fetch", "fetch_kernel.cl");
feed_var = scope_.get()->Var("feed");
fetch_var = scope_.get()->Var("fetch");
op_in_var = scope_.get()->Var("op_in");
op_out_var = scope_.get()->Var("op_out");
}
void Predict(string op_type, DDim feed_dims, DDim fetch_dims,
VariableNameMap inputs_feed, VariableNameMap outputs_feed,
AttributeMap attrs_feed) {
framework::CLImage *const op_in_cl_image =
op_in_var->template GetMutable<framework::CLImage>();
op_in_cl_image->Resize(feed_dims);
op_in_cl_image->InitEmptyImage(feed_clhelper_.CLContext(),
feed_clhelper_.CLCommandQueue(), feed_dims);
framework::CLImage *const op_out_cl_image =
op_out_var->template GetMutable<framework::CLImage>();
op_out_cl_image->Resize(fetch_dims);
framework::CLScope *const clScpoe = scope_->GetCLScpoe();
op_out_cl_image->InitEmptyImage(clScpoe->Context(), clScpoe->CommandQueue(),
fetch_dims);
Feed(feed_dims);
auto *op = new OpType(op_type, inputs_feed, outputs_feed, attrs_feed,
scope_.get());
op->InferShape();
op->Init();
op->Run();
Fetch(fetch_dims);
}
void Feed(DDim feed_dims) {
auto *feed_var = scope_->Var("feed");
auto *_var = scope_->Var("op_in");
auto *const input = feed_var->template GetMutable<framework::LoDTensor>();
DLOG << "feed_dims: " << feed_dims;
SetupTensor<float>(input, feed_dims, -100.0, 100.0);
framework::CLImage *const op_in_cl_image =
op_in_var->template GetMutable<framework::CLImage>();
DLOG << "FeedKernel run ";
DLOG << "params.input " << *input;
DLOG << "params.op_in_cl_image " << *op_in_cl_image;
auto kernel = this->feed_clhelper_.KernelAt(0);
DLOG << "kernel get success ";
auto default_work_size =
this->feed_clhelper_.DefaultWorkSize(*(op_in_cl_image));
DLOG << "op_in_cl_image: " << *op_in_cl_image;
DLOG << "default_work_size: " << default_work_size;
cl_int status;
int numel = input->numel();
cl_mem output_image = op_in_cl_image->GetCLImage();
const int out_C = op_in_cl_image->dims()[1];
const int out_H = op_in_cl_image->dims()[2];
const int out_W = op_in_cl_image->dims()[3];
const int Stride2 = out_C * out_H * out_W;
const int Stride1 = out_H * out_W;
const int Stride0 = out_W;
framework::CLTensor input_cl_tensor(this->feed_clhelper_.CLContext(),
this->feed_clhelper_.CLCommandQueue());
input_cl_tensor.Resize(input->dims());
cl_mem inputBuffer;
inputBuffer =
input_cl_tensor.mutable_with_data<float>(input->data<float>());
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(cl_int), &out_H);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_int), &out_W);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_int), &out_C);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_int), &Stride0);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_int), &Stride1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_int), &Stride2);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->feed_clhelper_.CLCommandQueue(), kernel, default_work_size.size(),
NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
DLOG << "*op_in_cl_image: " << *op_in_cl_image;
}
void Fetch(DDim fetch_dims) {
DLOG << "------------------ Fetch op ---------------------";
DLOG << "------------------ Fetch op end ---------------------";
}
private:
std::shared_ptr<paddle_mobile::framework::Scope> scope_;
framework::CLHelper feed_clhelper_;
framework::CLHelper fetch_clhelper_;
Variable *feed_var;
Variable *fetch_var;
Variable *op_in_var;
Variable *op_out_var;
};
} // namespace paddle_mobile
#endif
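For reference, the Stride0/Stride1/Stride2 arguments handed to the feed kernel above are the usual NCHW strides. A tiny sketch of how they index a flat buffer (nchw_offset is a hypothetical helper, not part of the framework):

def nchw_offset(n, c, h, w, C, H, W):
    # Stride0 = W, Stride1 = H * W, Stride2 = C * H * W, as set via clSetKernelArg above
    stride0, stride1, stride2 = W, H * W, C * H * W
    return n * stride2 + c * stride1 + h * stride0 + w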
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "io/paddle_inference_api.h"
using namespace paddle_mobile; // NOLINT
PaddleMobileConfig GetConfig() {
PaddleMobileConfig config;
config.precision = PaddleMobileConfig::FP32;
config.device = PaddleMobileConfig::kGPU_CL;
config.pre_post_type = PaddleMobileConfig::NONE_PRE_POST;
config.prog_file = "../models/ercy/model";
config.param_file = "../models/ercy/params";
config.lod_mode = false;
config.load_when_predict = false;
return config;
}
int main() {
PaddleMobileConfig config = GetConfig();
auto predictor =
CreatePaddlePredictor<PaddleMobileConfig,
PaddleEngineKind::kPaddleMobile>(config);
// reliable
int re_len = 1 * 1 * 64 * 72;
std::vector<float> re_v;
std::vector<int64_t> re_dims{1, 1, 64, 72};
GetInput<float>(g_test_image_1x3x224x224, &re_v, re_dims);
PaddleTensor re;
re.shape = std::vector<int>({1, 1, 64, 72});
re.data = PaddleBuf(re_v.data(), re_len * sizeof(float));
re.dtype = PaddleDType::FLOAT32;
re.layout = LayoutType::LAYOUT_CHW;
// grid
int grid_len = 1 * 64 * 72 * 2;
std::vector<float> grid_v;
std::vector<int64_t> grid_dims{1, 64, 72, 2};
GetInput<float>(g_test_image_1x3x224x224, &grid_v, grid_dims);
PaddleTensor grid;
grid.shape = std::vector<int>({1, 64, 72, 2});
grid.data = PaddleBuf(grid_v.data(), grid_len * sizeof(float));
grid.dtype = PaddleDType::FLOAT32;
grid.layout = LayoutType::LAYOUT_CHW;
// last_input
int last_len = 1 * 128 * 64 * 72;
std::vector<float> last_v;
std::vector<int64_t> last_dims{1, 128, 64, 72};
GetInput<float>(g_test_image_1x3x224x224, &last_v, last_dims);
PaddleTensor last;
last.shape = std::vector<int>({1, 128, 64, 72});
last.data = PaddleBuf(last_v.data(), last_len * sizeof(float));
last.dtype = PaddleDType::FLOAT32;
last.layout = LayoutType::LAYOUT_CHW;
// input_rgb
int input_rgb_len = 1 * 4 * 256 * 288;
std::vector<float> input_rgb_v;
std::vector<int64_t> input_rgb_dims{1, 4, 256, 288};
GetInput<float>(g_test_image_1x3x224x224, &input_rgb_v, input_rgb_dims);
PaddleTensor input_rgb;
input_rgb.shape = std::vector<int>({1, 4, 256, 288});
input_rgb.data = PaddleBuf(input_rgb_v.data(), input_rgb_len * sizeof(float));
input_rgb.dtype = PaddleDType::FLOAT32;
input_rgb.layout = LayoutType::LAYOUT_CHW;
PaddleTensor output0;
output0.shape = std::vector<int>({});
output0.data = PaddleBuf();
output0.dtype = PaddleDType::FLOAT32;
output0.layout = LayoutType::LAYOUT_CHW;
PaddleTensor output1;
output1.shape = std::vector<int>({});
output1.data = PaddleBuf();
output1.dtype = PaddleDType::FLOAT32;
output1.layout = LayoutType::LAYOUT_CHW;
predictor->Feed("reliable", re);
predictor->Feed("grid", grid);
predictor->Feed("last_input", last);
predictor->Feed("input_rgb", input_rgb);
predictor->Run();
predictor->Fetch("save_infer_model/scale_0", &output0);
predictor->Fetch("save_infer_model/scale_1", &output1);
float* out_ptr0 = reinterpret_cast<float*>(output0.data.data());
float* out_ptr1 = reinterpret_cast<float*>(output1.data.data());
std::cout << " print output0 : " << std::endl;
int numel = output0.data.length() / sizeof(float);
int stride = numel / 20;
stride = stride > 0 ? stride : 1;
for (int j = 0; j < numel; j += stride) {
std::cout << out_ptr0[j] << " ";
}
std::cout << std::endl;
std::cout << " print output1 : " << std::endl;
numel = output1.data.length() / sizeof(float);
stride = numel / 20;
stride = stride > 0 ? stride : 1;
for (int j = 0; j < numel; j += stride) {
std::cout << out_ptr1[j] << " ";
}
std::cout << std::endl;
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_MOBILE_CL
#include <fstream>
#include <iostream>
#include <string>
#include "../test_helper.h"
#include "../test_include.h"
void test(int argc, char *argv[]);
void feed(PaddleMobile<paddle_mobile::GPU_CL> *paddle_mobile, const DDim &dims,
std::string feed_name) {
float *input_data_array = new float[product(dims)];
std::ifstream in(feed_name, std::ios::in);
for (int i = 0; i < product(dims); i++) {
float num;
in >> num;
input_data_array[i] = num;
}
in.close();
framework::Tensor input_tensor(input_data_array, dims);
DLOG << feed_name << " : " << input_tensor;
paddle_mobile->Feed(feed_name, input_tensor);
}
int main(int argc, char *argv[]) {
test(argc, argv);
return 0;
}
void test(int argc, char *argv[]) {
int arg_index = 1;
bool fuse = std::stoi(argv[arg_index]) == 1;
arg_index++;
bool enable_memory_optimization = std::stoi(argv[arg_index]) == 1;
arg_index++;
bool quantification = std::stoi(argv[arg_index]) == 1;
arg_index++;
int quantification_fold = std::stoi(argv[arg_index]);
arg_index++;
paddle_mobile::PaddleMobileConfigInternal config;
config.memory_optimization_level = enable_memory_optimization
? MemoryOptimizationWithoutFeeds
: NoMemoryOptimization;
#ifdef PADDLE_MOBILE_CL
// config.load_when_predict = true;
paddle_mobile::PaddleMobile<paddle_mobile::GPU_CL> paddle_mobile(config);
paddle_mobile.SetCLPath("/data/local/tmp/bin");
std::cout << "testing opencl yyz " << std::endl;
#else
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile(config);
paddle_mobile.SetThreadNum(1);
std::cout << "testing cpu yyz " << std::endl;
#endif
int dim_count = std::stoi(argv[arg_index]);
arg_index++;
int size = 1;
arg_index += dim_count;
bool is_lod = std::stoi(argv[arg_index]) == 1;
arg_index++;
paddle_mobile::framework::LoD lod{{}};
if (is_lod) {
int lod_count = std::stoi(argv[arg_index]);
arg_index++;
for (int i = 0; i < lod_count; i++) {
int dim = std::stoi(argv[arg_index + i]);
lod[0].push_back(dim);
}
arg_index += lod_count;
}
int var_count = std::stoi(argv[arg_index]);
arg_index++;
bool is_sample_step = std::stoi(argv[arg_index]) == 1;
arg_index++;
int sample_arg = std::stoi(argv[arg_index]);
int sample_step = sample_arg;
int sample_num = sample_arg;
arg_index++;
std::vector<std::string> var_names;
for (int i = 0; i < var_count; i++) {
std::string var_name = argv[arg_index + i];
var_names.push_back(var_name);
}
arg_index += var_count;
bool check_shape = std::stoi(argv[arg_index]) == 1;
arg_index++;
auto time1 = time();
if (paddle_mobile.Load("./checked_model/model", "./checked_model/params",
fuse, quantification, 1, is_lod,
quantification_fold)) {
auto time2 = time();
std::cout << "auto-test"
<< " load-time-cost :" << time_diff(time1, time2) << "ms"
<< std::endl;
feed(&paddle_mobile, {1, 4, 256, 288}, "input_rgb");
feed(&paddle_mobile, {1, 128, 64, 72}, "last_input");
feed(&paddle_mobile, {1, 64, 72, 2}, "grid");
feed(&paddle_mobile, {1, 1, 64, 72}, "reliable");
paddle_mobile.Predict();
#ifdef PADDLE_MOBILE_CL
for (auto var_name : var_names) {
auto cl_image = paddle_mobile.FetchImage(var_name);
if (cl_image == nullptr || cl_image->GetCLImage() == nullptr) {
continue;
}
auto len = cl_image->numel();
if (len == 0) {
continue;
}
size_t width = cl_image->ImageDims()[0];
size_t height = cl_image->ImageDims()[1];
paddle_mobile::framework::half_t *image_data =
new paddle_mobile::framework::half_t[height * width * 4];
cl_int err;
cl_mem image = cl_image->GetCLImage();
size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
err = clEnqueueReadImage(cl_image->CommandQueue(), image, CL_TRUE, origin,
region, 0, 0, image_data, 0, NULL, NULL);
CL_CHECK_ERRORS(err);
float *tensor_data = new float[cl_image->numel()];
auto converter = cl_image->Converter();
converter->ImageToNCHW(image_data, tensor_data, cl_image->ImageDims(),
cl_image->dims());
auto data = tensor_data;
std::string sample = "";
if (check_shape) {
for (int i = 0; i < cl_image->dims().size(); i++) {
sample += " " + std::to_string(cl_image->dims()[i]);
}
}
if (!is_sample_step) {
sample_step = len / sample_num;
}
if (sample_step <= 0) {
sample_step = 1;
}
for (int i = 0; i < len; i += sample_step) {
sample += " " + std::to_string(data[i]);
}
std::cout << "auto-test"
<< " var " << var_name << sample << std::endl;
}
#else
for (auto var_name : var_names) {
auto out = paddle_mobile.Fetch(var_name);
auto len = out->numel();
if (len == 0) {
continue;
}
if (out->memory_size() == 0) {
continue;
}
if (out->type() == type_id<int>()) {
auto data = out->data<int>();
std::string sample = "";
if (check_shape) {
for (int i = 0; i < out->dims().size(); i++) {
sample += " " + std::to_string(out->dims()[i]);
}
}
if (!is_sample_step) {
sample_step = len / sample_num;
}
if (sample_step <= 0) {
sample_step = 1;
}
for (int i = 0; i < len; i += sample_step) {
sample += " " + std::to_string(data[i]);
}
std::cout << "auto-test"
<< " var " << var_name << sample << std::endl;
} else if (out->type() == type_id<float>()) {
auto data = out->data<float>();
std::string sample = "";
if (check_shape) {
for (int i = 0; i < out->dims().size(); i++) {
sample += " " + std::to_string(out->dims()[i]);
}
}
if (!is_sample_step) {
sample_step = len / sample_num;
}
if (sample_step <= 0) {
sample_step = 1;
}
for (int i = 0; i < len; i += sample_step) {
sample += " " + std::to_string(data[i]);
}
std::cout << "auto-test"
<< " var " << var_name << sample << std::endl;
}
}
#endif
std::cout << std::endl;
}
}
#endif
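The feed() helper in the test above reads each input from a plain text file with one float per line, product(dims) values in total. A small sketch of producing such a file (the "reliable" file name and the shape are just the example values used above; the check script below writes these files the same way):

import numpy as np

data = np.random.random((1, 1, 64, 72)).astype("float32")   # shape of the "reliable" input above
np.savetxt("reliable", data.flatten(), fmt="%f")             # one value per line, as feed() expects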
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_MOBILE_CL
#include "../executor_for_test_opencl.h"
#include "operators/expand_op.h"
#include "operators/feed_op.h"
#ifdef EXPAND_OP
int main() {
const int IN_N = 1;
const int IN_C = 1;
const int IN_H = 2;
const int IN_W = 3;
const int EXPEND_N = 1;
const int EXPEND_C = 1;
const int EXPEND_H = 2;
const int EXPEND_W = 2;
const int OUT_N = IN_N * EXPEND_N;
const int OUT_C = IN_C * EXPEND_C;
const int OUT_H = IN_H * EXPEND_H;
const int OUT_W = IN_W * EXPEND_W;
framework::DDim in_dims = framework::make_ddim({IN_N, IN_C, IN_H, IN_W});
framework::DDim out_dims = framework::make_ddim({OUT_N, OUT_C, OUT_H, OUT_W});
VariableNameMap inputs;
VariableNameMap outputs;
AttributeMap attrs;
inputs["X"] = std::vector<std::string>({"op_in"});
outputs["Out"] = std::vector<std::string>({"op_out"});
std::vector<int> expand_times = {EXPEND_N, EXPEND_C, EXPEND_H, EXPEND_W};
attrs["expand_times"].Set<std::vector<int>>(expand_times);
OpenClOpTester<operators::ExpandOp<GPU_CL, float>> tester;
tester.Predict("expend", in_dims, out_dims, inputs, outputs, attrs);
}
#endif
#else
int main() {}
#endif
...@@ -273,8 +273,9 @@ endif() ...@@ -273,8 +273,9 @@ endif()
list(FIND NET "op" CON) list(FIND NET "op" CON)
if (CON GREATER -1) if (CON GREATER -1)
message("op enabled") message("op enabled")
set(SIGMOID_OP ON) # set(SIGMOID_OP ON)
set(LEAKY_RELU_OP ON) # set(LEAKY_RELU_OP ON)
set(BLOG ON)
set(FOUND_MATCH ON) set(FOUND_MATCH ON)
endif() endif()
...@@ -379,6 +380,8 @@ if(NOT FOUND_MATCH) ...@@ -379,6 +380,8 @@ if(NOT FOUND_MATCH)
set(REDUCE_PROD_OP ON) set(REDUCE_PROD_OP ON)
set(FUSION_INSTANCENORM_RELU_OP ON) set(FUSION_INSTANCENORM_RELU_OP ON)
set(PIXEL_SHUFFLE_OP ON) set(PIXEL_SHUFFLE_OP ON)
set(EXPAND_OP ON)
set(GRID_SAMPLER_OP ON)
endif() endif()
# option(BATCHNORM_OP "" ON) # option(BATCHNORM_OP "" ON)
...@@ -755,3 +758,13 @@ endif() ...@@ -755,3 +758,13 @@ endif()
if (PIXEL_SHUFFLE_OP) if (PIXEL_SHUFFLE_OP)
add_definitions(-DPIXEL_SHUFFLE_OP) add_definitions(-DPIXEL_SHUFFLE_OP)
endif() endif()
if (EXPAND_OP)
add_definitions(-DEXPAND_OP)
endif()
if (GRID_SAMPLER_OP)
add_definitions(-DGRID_SAMPLER_OP)
endif()
if (BLOG)
add_definitions(-DBLOG)
endif()
...@@ -3,3 +3,4 @@ ...@@ -3,3 +3,4 @@
!.gitignore !.gitignore
!/model-encrypt-tool !/model-encrypt-tool
!test_wrap.py !test_wrap.py
!run_multi_feed.py
# -*- coding: utf-8 -*-
import os
import sys
import math
import subprocess
import numpy as np
import paddle.fluid as fluid
model_path = "erciyuan"
checked_model_path = "checked_model"
feed_path = "feeds"
output_path = "outputs"
diff_threshold = 0.1
is_lod = False
mobile_model_path = ""
fast_check = False
is_sample_step = False
sample_step = 1
sample_num = 20
need_encrypt = False
checked_encrypt_model_path = "checked_encrypt_model"
output_var_filter = []
output_key_filter = {}
check_shape = False
quantification = False
quantification_fold = 1000
architecture = "arm-v7a"
# architecture = "arm-v8a"
correct_persistable = False
np.set_printoptions(linewidth=150)
mobile_exec_root = "/data/local/tmp/bin"
mobile_src_root = os.path.abspath("../../../")
if mobile_src_root.endswith("/"):
mobile_src_root = mobile_src_root[:-1]
dot = "•"
black = lambda x: "\033[30m" + str(x) + "\033[0m"
red = lambda x: "\033[31m" + str(x) + "\033[0m"
green = lambda x: "\033[32m" + str(x) + "\033[0m"
yellow = lambda x: "\033[33m" + str(x) + "\033[0m"
reset = lambda x: "\033[0m" + str(x)
feed_names_ = []
def pp_tab(x, level=0):
header = ""
for i in range(0, level):
header += "\t"
print(header + str(x))
def pp_black(x, level=0):
pp_tab(black(x) + reset(""), level)
def pp_red(x, level=0):
pp_tab(red(x) + reset(""), level)
def pp_green(x, level=0):
pp_tab(green(x) + reset(""), level)
def pp_yellow(x, level=0):
pp_tab(yellow(x) + reset(""), level)
def sh(command):
pipe = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
return pipe.stdout.read().decode("utf-8")
def push(src, dest=""):
sh("adb push {} {}".format(src, mobile_exec_root + "/" + dest))
pp_yellow(dot + " start inspecting fluid model")
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
# Load the model
def load_model(model_path):
prog, feeds, fetches = fluid.io.load_inference_model(dirname=model_path, executor=exe, model_filename="model", params_filename="params")
global correct_persistable
if correct_persistable:
ops = prog.current_block().ops
vars = prog.current_block().vars
for op in ops:
for var_name in op.output_arg_names:
if var_name == "fetch":
continue
var = vars[var_name]
if var.persistable:
pp_red("has found non-persistable output var : {}".format(var_name))
var.persistable = False
return (prog, feeds, fetches)
prog, feeds, fetches = load_model(model_path)
# Force all tensor shapes to agree between the model and params files, then re-save the model
def resave_model(feed_kv):
if len(mobile_model_path) > 0:
pp_green("has set mobile_model_path, stop checking model & params", 1)
sh("cp {}/* {}".format(mobile_model_path, checked_model_path))
return
ops = prog.current_block().ops
vars = prog.current_block().vars
# Force all vars to be persistable
p_names = []
for name in vars:
name = str(name)
v = fluid.framework._get_var(name, prog)
if not v.persistable:
v.persistable = True
p_names.append(name)
outputs = run_model(feed_kv=feed_kv)
has_found_wrong_shape = False
# Correct each var's shape
for name in vars:
name = str(name)
v = vars[name]
if v.persistable:
v1 = fluid.global_scope().find_var(name)
try:
t1 = v1.get_tensor()
shape = t1.shape()
except:
continue
if v.desc.shape() != shape:
has_found_wrong_shape = True
v.desc.set_shape(shape)
# Restore each var's persistable attribute
for name in p_names:
v = fluid.framework._get_var(name, prog)
v.persistable = False
if not quantification:
fluid.io.save_inference_model(dirname=checked_model_path, feeded_var_names=feeds, target_vars=fetches, executor=exe, main_program=prog, model_filename="model", params_filename="params")
if has_found_wrong_shape:
pp_red("has found wrong shape", 1)
else:
pp_green("has not found wrong shape", 1)
pp_green("new model is saved into directory 【{}】".format(checked_model_path), 1)
# Encrypt model and params separately, using the same key for both
def encrypt_model():
if not need_encrypt:
return
pp_yellow(dot + dot + " encrypting model")
if not os.path.exists(checked_encrypt_model_path):
os.mkdir(checked_encrypt_model_path)
res = sh("model-encrypt-tool/enc_key_gen -l 20 -c 232")
lines = res.split("\n")
for line in lines:
if line.startswith("key:"):
line = line.replace('key:','')
sh("model-encrypt-tool/enc_model_gen -k '{}' -c 2 -i checked_model/model -o "
"checked_model/model.ml".format(line))
sh("model-encrypt-tool/enc_model_gen -k '{}' -c 2 -i checked_model/params -o checked_model/params.ml".format(line))
pp_green("model has been encrypted, key is : {}".format(line), 1)
sh("mv {} {}".format(checked_model_path + "/*.ml", checked_encrypt_model_path))
return
pp_red("model encrypt error", 1)
# Generate feed key-value pairs
def gen_feed_kv():
feed_kv = {}
for feed_name in feeds:
feed_shape = get_feed_var_shape(feed_name)
data = np.random.random(feed_shape).astype("float32")
feed_kv[feed_name] = data
return feed_kv
# Save the feed key-value pairs
def save_feed_kv(feed_kv):
for feed_name in feed_kv:
feed_data = feed_kv[feed_name]
feed_list = feed_data.flatten().tolist()
if not os.path.exists(feed_path):
os.mkdir(feed_path)
file_name = feed_name.replace("/", "_")
out_file = open(feed_path + "/" + file_name, "w")
for feed_item in feed_list:
out_file.write("{}\n".format(feed_item))
out_file.close()
last_feed_var_name = None
last_feed_file_name = None
last_feed_var_lod = None
# Load the feed key-value pairs
def load_feed_kv():
if not os.path.exists(feed_path):
return None
global last_feed_var_name
global last_feed_file_name
global last_feed_var_lod
feed_kv = {}
pp_yellow(dot + dot + " checking feed info")
pp_green("feed data is saved into directory 【{}】".format(feed_path), 1)
for feed_name in feeds:
feed_shape = get_feed_var_shape(feed_name)
pp_tab("feed var name : {}; feed var shape : {}".format(feed_name, feed_shape), 1)
file_name = feed_name.replace("/", "_")
last_feed_var_name = feed_name
last_feed_file_name = file_name
feed_file_path = feed_path + "/" + file_name
if not os.path.exists(feed_file_path):
return None
data = np.loadtxt(feed_file_path)
expected_len = 1
for dim in feed_shape:
expected_len *= dim
if len(np.atleast_1d(data)) != expected_len:
return None
data = data.reshape(feed_shape).astype("float32")
if is_lod:
data_shape = [1]
for dim in feed_shape:
data_shape.append(dim)
data = data.reshape(data_shape).astype("float32")
tensor = fluid.LoDTensor()
seq_lens = [len(seq) for seq in data]
cur_len = 0
lod = [cur_len]
for l in seq_lens:
cur_len += l
lod.append(cur_len)
data = data.reshape(feed_shape)
tensor.set(data, fluid.CPUPlace())
tensor.set_lod([lod])
last_feed_var_lod = lod
feed_kv[feed_name] = tensor
else:
feed_kv[feed_name] = data
return feed_kv
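As a concrete example of the LoD built above: for two sequences of lengths 2 and 3, lod becomes [0, 2, 5], and sequence i occupies rows lod[i]:lod[i+1] of the tensor.

seq_lens = [2, 3]
lod = [0]
for l in seq_lens:
    lod.append(lod[-1] + l)
assert lod == [0, 2, 5]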
# Run the model
def run_model(feed_kv=None):
pp_yellow("run_model", 1)
if feed_kv is None:
feed_kv = gen_feed_kv()
feed_names_.clear()
for feed_name in feeds:
feed_names_.append(feed_name)
pp_green(feed_name, 1)
pp_green(feed_names_, 1)
outputs = exe.run(prog, feed=feed_kv, fetch_list=fetches, return_numpy=False)
results = []
for output in outputs:
results.append(np.array(output))
return results
# Get a variable's shape
def get_var_shape(var_name):
vars = prog.current_block().vars
shape = vars[var_name].desc.shape()
for i in range(len(shape)):
dim = shape[i]
if dim == -1:
shape[i] = 1
return shape
# Get a feed variable's shape
def get_feed_var_shape(var_name):
# To hard-code the input shape, uncomment the following line
# return [1, 3, 224, 224]
return get_var_shape(var_name)
persistable_cache = []
# Make every var persistable
def force_all_vars_to_persistable():
global persistable_cache
for var_name in vars.keys():
var_name = str(var_name)
v = fluid.framework._get_var(var_name, prog)
persistable = v.persistable
if not persistable:
persistable_cache.append(var_name)
v.persistable = True
# Restore the persistable attributes
def restore_all_vars_persistable():
global persistable_cache
for var_name in vars.keys():
var_name = str(var_name)
v = fluid.framework._get_var(var_name, prog)
persistable = v.persistable
if var_name in persistable_cache:
v.persistable = False
persistable_cache = []
# Get a var's data
def get_var_data(var_name, feed_kv=None):
output = np.array(fluid.global_scope().var(var_name).get_tensor())
return output
output_var_cache = {}
def tensor_sample(tensor):
if is_sample_step:
step = sample_step
else:
step = math.floor(len(tensor) / sample_num)
step = max(step, 1)
step = int(step)
sample = []
for i in range(0, len(tensor), step):
sample.append(tensor[i])
return sample
op_cache = {}
# Save the output data of every op
def save_all_op_output(feed_kv=None):
force_all_vars_to_persistable()
outputs = run_model(feed_kv=feed_kv)
if not os.path.exists(output_path):
os.mkdir(output_path)
ops = prog.current_block().ops
fetch_names = []
for fetch in fetches:
fetch_names.append(fetch.name)
feed_names = feeds
if len(output_var_filter) > 0:
for fetch_name in fetch_names:
output_var_filter.append(fetch_name)
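# For each op, pick the output var to record: prefer the output slot named Y/Out/Output,
# otherwise fall back to an output arg whose name contains "tmp".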
for i in range(len(ops)):
op = ops[i]
var_name = None
var_name_index = -1
for index in range(len(op.output_names)):
if op.output_names[index] in ["Y", "Out", "Output"]:
var_name_index = index
break
if var_name_index != -1:
var_name = op.output_arg_names[var_name_index]
else:
for name in op.output_arg_names:
var_name = name
if "tmp" in name:
break
if len(output_var_filter) > 0:
if var_name not in output_var_filter:
continue
# real_var_name = None
# if op.type == "fetch":
# for name in op.input_arg_names:
# real_var_name = name
# if "tmp" in name:
# break
# else:
# real_var_name = var_name
if fast_check:
if var_name not in fetch_names and var_name not in feed_names:
continue
try:
data = get_var_data(var_name, feed_kv=feed_kv).flatten().tolist()
sample = tensor_sample(data)
output_var_cache[var_name] = (sample)
op_cache[i] = (var_name, op)
file_name = var_name.replace("/", "_")
out_file = open(output_path + "/" + file_name, "w")
if var_name in feed_names:
for item in data:
out_file.write("{}\n".format(item))
else:
for item in sample:
out_file.write("{}\n".format(item))
out_file.close()
except:
pass
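# Vars that cannot be fetched from the scope are skipped silently by the bare except above.
# Second pass: also record outputs selected by output_key_filter (a map of op type -> output slot names).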
for i in range(len(ops)):
op = ops[i]
if op.type not in output_key_filter:
continue
var_name = None
var_name_index = -1
for index in range(len(op.output_names)):
if op.output_names[index] in output_key_filter[op.type]:
var_name_index = index
break
if var_name_index != -1:
var_name = op.output_arg_names[var_name_index]
else:
continue
if len(output_var_filter) > 0:
if var_name not in output_var_filter:
continue
# real_var_name = None
# if op.type == "fetch":
# for name in op.input_arg_names:
# real_var_name = name
# if "tmp" in name:
# break
# else:
# real_var_name = var_name
if fast_check:
if var_name not in fetch_names and var_name not in feed_names:
continue
try:
data = get_var_data(var_name, feed_kv=feed_kv).flatten().tolist()
sample = tensor_sample(data)
output_var_cache[var_name] = (sample)
op_cache[i] = (var_name, op)
file_name = var_name.replace("/", "_")
out_file = open(output_path + "/" + file_name, "w")
if var_name in feed_names:
for item in data:
out_file.write("{}\n".format(item))
else:
for item in sample:
out_file.write("{}\n".format(item))
out_file.close()
except:
pass
pp_green("all the op outputs are saved into directory 【{}】".format(output_path), 1)
restore_all_vars_persistable()
ops = prog.current_block().ops
vars = prog.current_block().vars
pp_yellow(dot + dot + " checking op list")
op_types = set()
for op in ops:
op_types.add(op.type)
pp_tab("op types : {}".format(op_types), 1)
def check_mobile_results(args, fuse, mem_opt):
args = "{} {} {} {} {}".format("1" if fuse else "0", "1" if mem_opt else "0", "1" if quantification else "0", quantification_fold, args)
pp_green(args, 1)
res = sh("adb shell \"cd {} && export LD_LIBRARY_PATH=. && ./test-net-feeds {}\"".format(mobile_exec_root, args))
lines = res.split("\n")
for line in lines:
print(line)
# for line in lines:
# if line.startswith("auto-test-debug"):
# print(line)
pp_yellow(dot + dot + " checking paddle mobile results for {} -- {} ".format(green("【fusion】" if fuse else "【non fusion】"), green("【memory-optimization】" if mem_opt else "【non-memory-optimization】")))
mobile_var_cache = {}
for line in lines:
parts = line.split(" ")
if len(parts) < 2:
continue
if "auto-test" != parts[0]:
continue
if parts[1] == "load-time-cost":
pp_green("load time cost : {}".format(parts[2]), 1)
elif parts[1] == "predict-time-cost":
pp_green("predict time cost : {}".format(parts[2]), 1)
elif parts[1] == "preprocess-time-cost":
pp_green("preprocess time cost : {}".format(parts[2]), 1)
elif parts[1] == "var":
var_name = parts[2]
values = list(map(lambda x: float(x), parts[3:]))
mobile_var_cache[var_name] = values
error_index = None
error_values1 = None
error_values2 = None
checked_names = []
fetch_names = []
for fetch in fetches:
fetch_names.append(fetch.name)
fetch_diff = 0.0
fetch_count = 0
for index in op_cache:
op_output_var_name, op = op_cache[index]
if not op_output_var_name in output_var_cache:
continue
if not op_output_var_name in mobile_var_cache:
continue
if op_output_var_name not in fetch_names:
continue
values1 = output_var_cache[op_output_var_name]
values2 = mobile_var_cache[op_output_var_name]
shape = get_var_shape(op_output_var_name) if check_shape else []
for i in range(len(values1)):
v1 = values1[i]
v2 = values2[len(shape) + i]
fetch_diff += abs(v1 - v2)
fetch_count += 1
if fetch_count != 0:
pp_yellow("output avg diff : {}".format(fetch_diff / fetch_count), 1)
for index in op_cache:
op_output_var_name, op = op_cache[index]
if mem_opt:
found_in_fetch = False
for fetch in fetches:
if op_output_var_name == fetch.name:
found_in_fetch = True
break
if not found_in_fetch:
continue
if not op_output_var_name in output_var_cache:
continue
if not op_output_var_name in mobile_var_cache:
continue
if op_output_var_name not in fetch_names:
continue
values1 = output_var_cache[op_output_var_name]
values2 = mobile_var_cache[op_output_var_name]
shape = get_var_shape(op_output_var_name) if check_shape else []
if len(values1) + len(shape) != len(values2):
error_index = index
for i in range(len(shape)):
v1 = shape[i]
v2 = values2[i]
if v1 != v2:
error_index = index
break
if error_index == None:
for i in range(len(values1)):
v1 = values1[i]
v2 = values2[len(shape) + i]
if abs(v1 - v2) > diff_threshold:
error_index = index
break
checked_names.append(op_output_var_name)
if error_index != None:
error_values1 = values1
error_values2 = values2
break
if error_index == None:
for name in fetch_names:
if name not in checked_names:
error_index = -1
break
if error_index == None:
pp_green("outputs are all correct", 1)
elif error_index == -1:
pp_red("outputs are missing")
else:
error_values1 = np.array(error_values1)
error_values2 = np.array(error_values2)
# pp_red("mobile op is not correct, error occurs at {}th op, op's type is {}")
pp_red("outputs are incorrect", 1)
pp_red("fluid results are : ", 1)
pp_red(str(error_values1).replace("\n", "\n" + "\t" * 1), 1)
pp_yellow("paddle mobile results are : ", 1)
pp_red(str(error_values2).replace("\n", "\n" + "\t" * 1), 1)
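# When neither fusion nor memory optimization is enabled, every op output is available on the
# device, so walk all ops again to locate the first op whose output diverges from fluid.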
if not fuse and not mem_opt:
pp_yellow("checking individual ops : ", 1)
error_index = None
error_values1 = None
error_values2 = None
checked_names = []
fetch_names = []
for fetch in fetches:
fetch_names.append(fetch.name)
for index in op_cache:
op_output_var_name, op = op_cache[index]
if mem_opt:
found_in_fetch = False
for fetch in fetches:
if op_output_var_name == fetch.name:
found_in_fetch = True
break
if not found_in_fetch:
continue
if not op_output_var_name in output_var_cache:
continue
if not op_output_var_name in mobile_var_cache:
continue
if fuse or mem_opt:
if op_output_var_name not in fetch_names:
continue
values1 = output_var_cache[op_output_var_name]
values2 = mobile_var_cache[op_output_var_name]
shape = get_var_shape(op_output_var_name) if check_shape else []
if len(values1) + len(shape) != len(values2):
error_index = index
for i in range(len(shape)):
v1 = shape[i]
v2 = values2[i]
if v1 != v2:
error_index = index
break
if error_index == None:
for i in range(len(values1)):
v1 = values1[i]
v2 = values2[len(shape) + i]
if ((not math.isnan(v1)) and math.isnan(v2)) or abs(v1 - v2) > diff_threshold:
error_index = index
break
checked_names.append(op_output_var_name)
if error_index != None:
error_values1 = values1
error_values2 = values2
break
if error_index == None:
for name in fetch_names:
if name not in checked_names:
error_index = -1
break
if error_index == None:
pp_green("outputs are all correct", 1)
elif error_index == -1:
pp_red("outputs are missing")
else:
error_values1 = np.array(error_values1)
error_values2 = np.array(error_values2)
# pp_red("mobile op is not correct, error occurs at {}th op, op's type is {}")
pp_red("corresponding fluid op is {}th op, op's type is {}, wrong var name is {}".format(
error_index, op_cache[error_index][1].type, op_output_var_name), 1)
pp_red("fluid results are : ", 1)
pp_red(str(error_values1).replace("\n", "\n" + "\t" * 1), 1)
pp_yellow("paddle mobile results are : ", 1)
pp_red(str(error_values2).replace("\n", "\n" + "\t" * 1), 1)
# print(output_var_cache)
# print(mobile_var_cache)
def main():
# Load the feed key-value pairs
feed_kv = load_feed_kv()
if feed_kv == None:
feed_kv = gen_feed_kv()
save_feed_kv(feed_kv)
feed_kv = load_feed_kv()
# Run inference
pp_yellow(dot + dot + " checking inference")
outputs = run_model(feed_kv=feed_kv)
pp_tab("fluid output : {}".format(outputs), 1)
# Re-save the model
pp_yellow(dot + dot + " checking model correctness")
resave_model(feed_kv=feed_kv)
# Export the encrypted model
encrypt_model()
# Dump all intermediate results
pp_yellow(dot + dot + " checking output result of every op")
save_all_op_output(feed_kv=feed_kv)
pp_yellow(dot + dot + " checking fetch info")
for fetch in fetches:
fetch_name = fetch.name
fetch_shape = get_var_shape(fetch_name)
pp_tab("fetch var name : {}; fetch var shape : {}".format(fetch_name, fetch_shape), 1)
# Dump info of all ops and vars
info_file = open("info.txt", "w")
for i in range(len(ops)):
op = ops[i]
info_file.write("{}th op: type - {}\n".format(i, op.type))
info_file.write("inputs:\n")
for var_name in op.input_arg_names:
try:
shape = get_var_shape(var_name)
shape_str = ", ".join(list(map(lambda x: str(x), shape)))
info_file.write("var {} : {}\n".format(var_name, shape_str))
except:
pass
info_file.write("outputs:\n")
for var_name in op.output_arg_names:
try:
shape = get_var_shape(var_name)
shape_str = ", ".join(list(map(lambda x: str(x), shape)))
info_file.write("var {} : {}\n".format(var_name, shape_str))
except:
pass
info_file.close()
# Start checking paddle mobile correctness
print("")
print("==================================================")
print("")
pp_yellow(dot + " start inspecting paddle mobile correctness & performance")
push(checked_model_path)
pp_green(feed_names_, 1)
feed_names_argu = ""
for n in feed_names_:
feed_names_argu += "{}\n".format(n)
pp_green("feed name - {} ".format(str(n)), 1)
push(feed_path + "/" + str(n), "{}".format(str(n)))
push(feed_path + "/" + last_feed_file_name, "input.txt")
push(mobile_src_root + "/build/release/{}/build/libpaddle-mobile.so".format(architecture))
push(mobile_src_root + "/build/release/{}/build/cl_kernel".format(architecture))
push(mobile_src_root + "/test/build/test-net")
last_feed_var_shape = get_feed_var_shape(last_feed_var_name)
args = str(len(last_feed_var_shape))
for dim in last_feed_var_shape:
args += " " + str(dim)
if is_lod:
args += " 1"
args += " " + str(len(last_feed_var_lod))
for dim in last_feed_var_lod:
args += " " + str(dim)
else:
args += " 0"
args += " " + str(len(output_var_cache))
args += " " + str(1 if is_sample_step else 0)
if is_sample_step:
args += " " + str(sample_step)
else:
args += " " + str(sample_num)
for var_name in output_var_cache.keys():
args += " " + var_name
args += " " + str(1 if check_shape else 0)
if not fast_check:
check_mobile_results(args, False, False)
check_mobile_results(args, False, True)
check_mobile_results(args, True, False)
check_mobile_results(args, True, True)
if __name__ == "__main__":
main()