From c28971a94d35c12023abe4647ec8890bb3497529 Mon Sep 17 00:00:00 2001 From: "baolei.an" Date: Fri, 6 Dec 2019 11:39:16 +0800 Subject: [PATCH] merge to newest version --- cmake/cuda.cmake | 25 +- cmake/cudnn.cmake | 6 +- cmake/external/eigen.cmake | 16 +- cmake/external/xbyak.cmake | 6 +- cmake/external/xxhash.cmake | 13 +- cmake/generic.cmake | 5 +- cmake/lite.cmake | 5 +- lite/CMakeLists.txt | 16 +- lite/api/CMakeLists.txt | 31 +- lite/api/benchmark.cc | 7 +- lite/api/cxx_api.cc | 31 +- lite/api/cxx_api.h | 7 + lite/api/mobilenetv1_test.cc | 7 +- lite/api/model_optimize_tool.cc | 161 +- lite/api/model_test.cc | 16 +- lite/api/paddle_api.cc | 9 +- lite/api/paddle_api.h | 2 +- lite/api/paddle_place.cc | 9 +- lite/api/paddle_place.h | 3 +- lite/api/paddle_use_passes.h | 7 +- lite/api/test_step_rnn_lite_x86.cc | 14 - lite/backends/arm/math/CMakeLists.txt | 20 +- lite/backends/arm/math/col_im_transform.cc | 17 +- lite/backends/arm/math/col_im_transform.h | 6 +- .../arm/math/conv3x3_winograd_fp32_c4.cc | 564 ++++ .../arm/math/conv3x3s1_direct_fp32.cc | 10 +- .../arm/math/conv3x3s1_direct_int8.cc | 5 +- .../arm/math/conv3x3s1p01_depthwise_fp32.cc | 2539 +++++++++++++++++ .../arm/math/conv3x3s1px_depthwise_fp32.cc | 541 ++++ .../arm/math/conv3x3s2_direct_fp32.cc | 10 +- .../arm/math/conv3x3s2_direct_int8.cc | 10 +- .../arm/math/conv3x3s2p01_depthwise_fp32.cc | 1862 ++++++++++++ .../arm/math/conv3x3s2px_depthwise_fp32.cc | 362 +++ lite/backends/arm/math/conv_block_utils.h | 1007 +++---- lite/backends/arm/math/conv_depthwise.h | 32 - lite/backends/arm/math/conv_impl.cc | 220 +- lite/backends/arm/math/conv_impl.h | 16 + lite/backends/arm/math/conv_winograd_3x3.cc | 6 +- lite/backends/arm/math/funcs.h | 2 + lite/backends/arm/math/interpolate.cc | 65 +- lite/backends/arm/math/interpolate.h | 5 +- lite/backends/arm/math/layout.cc | 668 +++++ lite/backends/arm/math/layout.h | 30 + lite/backends/arm/math/packed_sgemm.cc | 644 ++++- lite/backends/arm/math/packed_sgemm_c4.cc | 1171 ++++++++ lite/backends/arm/math/packed_sgemm_c4.h | 53 + lite/backends/arm/math/pooling.cc | 18 +- lite/backends/arm/math/sgemv.cc | 570 +++- lite/backends/arm/math/sgemv.h | 9 +- lite/backends/cuda/CMakeLists.txt | 3 +- lite/backends/cuda/cuda_utils.h | 9 + lite/backends/cuda/math/CMakeLists.txt | 9 +- lite/backends/cuda/math/batched_gemm.cc | 134 + lite/backends/cuda/math/batched_gemm.h | 80 + lite/backends/cuda/math/cudnn_conv.cc | 26 +- lite/backends/cuda/math/cudnn_pool.cc | 159 ++ lite/backends/cuda/math/cudnn_pool.h | 79 + lite/backends/cuda/math/elementwise.cu | 94 +- lite/backends/cuda/math/elementwise.h | 21 + lite/backends/cuda/math/gemm.cc | 100 + lite/backends/cuda/math/gemm.h | 74 + lite/backends/cuda/math/utils.h | 18 + lite/backends/fpga/KD/pes/conv_process.hpp | 21 +- .../fpga/KD/pes/depthwise_conv_pe.hpp | 11 +- lite/backends/fpga/KD/pes/pooling_pe.hpp | 10 +- lite/backends/npu/builder.cc | 8 +- lite/backends/npu/builder.h | 111 - lite/backends/opencl/cl_wrapper.cc | 2 +- lite/backends/x86/math/CMakeLists.txt | 3 +- lite/backends/x86/math/beam_search.cc | 1 + lite/backends/x86/math/pooling.cc | 8 +- lite/backends/x86/math/search_fc.cc | 79 + lite/backends/x86/math/search_fc.h | 184 ++ .../x86/math/sequence_topk_avg_pooling.cc | 151 + .../x86/math/sequence_topk_avg_pooling.h | 46 + lite/core/CMakeLists.txt | 4 +- lite/core/arena/framework.cc | 3 + lite/core/context.h | 9 +- lite/core/device_info.cc | 4 +- lite/core/kernel.h | 18 +- lite/core/memory.cc | 2 +- .../mir/fusion/conv_activation_fuse_pass.cc | 1 + 
lite/core/mir/fusion/conv_bn_fuse_pass.cc | 2 +- .../mir/fusion/conv_elementwise_fuse_pass.cc | 3 +- .../elementwise_add_activation_fuse_pass.cc | 1 + lite/core/mir/fusion/fc_fuse_pass.cc | 1 + .../core/mir/fusion/quant_dequant_op_fuser.cc | 4 + lite/core/mir/memory_optimize_pass.cc | 3 +- lite/core/mir/pass.h | 33 +- lite/core/mir/pass_utils.cc | 32 +- lite/core/mir/pass_utils.h | 4 +- lite/core/mir/static_kernel_pick_pass.h | 24 +- .../mir/subgraph/generate_npu_program_pass.cc | 49 +- .../mir/subgraph/generate_npu_program_pass.h | 4 - .../generate_npu_program_pass_test.cc | 4 +- .../mir/subgraph/generate_xpu_program_pass.cc | 49 +- .../mir/subgraph/generate_xpu_program_pass.h | 4 - lite/core/mir/type_layout_cast_pass.cc | 24 +- lite/core/mir/type_target_cast_pass.cc | 34 +- lite/core/mir/variable_place_inference_pass.h | 70 +- lite/core/op_registry.cc | 2 + lite/core/op_registry.h | 16 +- lite/core/optimizer.h | 87 +- lite/core/profile/CMakeLists.txt | 3 +- lite/core/profile/profiler.cc | 117 + lite/core/profile/profiler.h | 59 + lite/core/profile/test_timer.cc | 81 + lite/core/profile/timer.h | 114 + lite/core/program.cc | 8 +- lite/core/program.h | 48 +- lite/demo/cxx/Makefile.def | 14 +- lite/demo/cxx/README.md | 26 +- .../mobile_detection/Makefile.android.armv7 | 61 + .../mobile_detection/Makefile.android.armv8 | 61 + .../mobile_full/Makefile.android.armv7 | 20 +- .../mobile_full/Makefile.android.armv8 | 20 +- .../mobile_light/Makefile.android.armv7 | 14 +- .../mobile_light/Makefile.android.armv8 | 14 +- .../cxx/mobile_detection/mobile_detection.cc | 210 ++ lite/demo/cxx/mobile_detection/test.jpg | Bin 0 -> 127499 bytes .../cxx/mobile_full/mobilenetv1_full_api.cc | 30 +- .../cxx/mobile_light/mobilenetv1_light_api.cc | 26 +- lite/gen_code/CMakeLists.txt | 2 - lite/kernels/arm/CMakeLists.txt | 62 +- lite/kernels/arm/conv_compute.cc | 73 +- lite/kernels/arm/conv_depthwise.cc | 35 +- lite/kernels/arm/conv_gemmlike.h | 17 +- lite/kernels/arm/conv_transpose_compute.cc | 25 +- lite/kernels/arm/conv_winograd.cc | 177 +- lite/kernels/arm/conv_winograd.h | 1 + lite/kernels/arm/fc_compute.cc | 3 +- lite/kernels/arm/fill_constant_compute.cc | 37 + lite/kernels/arm/interpolate_compute.cc | 14 +- lite/kernels/arm/layout_compute.cc | 179 ++ lite/kernels/arm/layout_compute.h | 43 + lite/kernels/arm/lookup_table_compute.cc | 14 +- lite/kernels/arm/lookup_table_compute_test.cc | 115 + lite/kernels/arm/lrn_compute.cc | 7 +- lite/kernels/arm/lrn_compute_test.cc | 4 +- lite/kernels/arm/matmul_compute.cc | 2 +- lite/kernels/arm/mul_compute.cc | 5 +- lite/kernels/arm/pool_compute.cc | 22 +- lite/kernels/arm/pool_compute_test.cc | 176 +- lite/kernels/arm/split_compute.cc | 4 + lite/kernels/cuda/CMakeLists.txt | 38 +- .../cuda/attention_padding_mask_compute.cu | 162 ++ .../cuda/attention_padding_mask_compute.h | 38 + .../attention_padding_mask_compute_test.cc | 134 + lite/kernels/cuda/bilinear_interp_compute.cu | 84 +- .../cuda/bilinear_interp_compute_test.cc | 111 + lite/kernels/cuda/calib_compute_cuda_test.cc | 27 +- lite/kernels/cuda/concat_compute.cu | 4 +- lite/kernels/cuda/conv_compute.cc | 25 +- lite/kernels/cuda/conv_compute_test.cc | 15 +- lite/kernels/cuda/elementwise_compute.cu | 318 +++ lite/kernels/cuda/elementwise_compute.h | 98 + lite/kernels/cuda/elementwise_compute_test.cc | 252 ++ lite/kernels/cuda/feed_compute.cc | 45 +- lite/kernels/cuda/feed_compute.h | 3 +- lite/kernels/cuda/layout_compute.cc | 27 + lite/kernels/cuda/lookup_table_compute.cu | 11 + .../cuda/match_matrix_tensor_compute.cu 
| 145 + .../cuda/match_matrix_tensor_compute.h | 42 + .../cuda/match_matrix_tensor_compute_test.cc | 122 + lite/kernels/cuda/mul_compute_test.cc | 2 + lite/kernels/cuda/nearest_interp_compute.cu | 87 +- .../cuda/nearest_interp_compute_test.cc | 111 + lite/kernels/cuda/pool_compute.cu | 76 +- lite/kernels/cuda/pool_compute.h | 17 + lite/kernels/cuda/pool_compute_test.cc | 252 +- .../cuda/search_aligned_mat_mul_compute.cc | 38 + .../cuda/search_aligned_mat_mul_compute.h | 103 + .../search_aligned_mat_mul_compute_test.cc | 221 ++ lite/kernels/cuda/search_fc_compute.cu | 170 ++ lite/kernels/cuda/search_fc_compute.h | 52 + lite/kernels/cuda/search_fc_compute_test.cc | 110 + lite/kernels/cuda/search_grnn_compute.cu | 351 +++ lite/kernels/cuda/search_grnn_compute.h | 46 + lite/kernels/cuda/search_grnn_compute_test.cc | 103 + .../cuda/search_group_padding_compute.cu | 164 ++ .../cuda/search_group_padding_compute.h | 38 + .../cuda/search_group_padding_compute_test.cc | 127 + .../cuda/search_seq_depadding_compute.cu | 115 + .../cuda/search_seq_depadding_compute.h | 39 + .../cuda/search_seq_depadding_compute_test.cc | 88 + lite/kernels/cuda/search_seq_fc_compute.cu | 98 + lite/kernels/cuda/search_seq_fc_compute.h | 43 + .../cuda/search_seq_fc_compute_test.cc | 175 ++ .../cuda/sequence_arithmetic_compute.cu | 249 ++ .../cuda/sequence_arithmetic_compute.h | 41 + .../cuda/sequence_arithmetic_compute_test.cc | 131 + lite/kernels/cuda/sequence_concat_compute.cu | 151 + lite/kernels/cuda/sequence_concat_compute.h | 40 + .../cuda/sequence_concat_compute_test.cc | 163 ++ lite/kernels/cuda/sequence_pool_compute.cu | 258 ++ lite/kernels/cuda/sequence_pool_compute.h | 35 + .../cuda/sequence_pool_compute_test.cc | 104 + lite/kernels/cuda/sequence_reverse_compute.cu | 130 + lite/kernels/cuda/sequence_reverse_compute.h | 38 + .../cuda/sequence_reverse_compute_test.cc | 105 + .../cuda/sequence_topk_avg_pooling_compute.cu | 209 ++ .../cuda/sequence_topk_avg_pooling_compute.h | 43 + lite/kernels/cuda/softmax_compute.cu | 23 +- lite/kernels/cuda/var_conv_2d_compute.cu | 263 ++ lite/kernels/cuda/var_conv_2d_compute.h | 37 + lite/kernels/cuda/var_conv_2d_compute_test.cc | 360 +++ lite/kernels/fpga/conv_compute.cc | 7 + lite/kernels/fpga/conv_compute_test.cc | 20 +- lite/kernels/npu/bridges/CMakeLists.txt | 9 + lite/kernels/npu/bridges/act_op.cc | 23 +- lite/kernels/npu/bridges/act_op_test.cc | 122 +- lite/kernels/npu/bridges/batch_norm_op.cc | 6 +- lite/kernels/npu/bridges/conv_op.cc | 36 +- lite/kernels/npu/bridges/conv_op_test.cc | 5 +- lite/kernels/npu/bridges/conv_transpose_op.cc | 17 +- .../npu/bridges/conv_transpose_op_test.cc | 3 +- lite/kernels/npu/bridges/elementwise_ops.cc | 85 +- .../npu/bridges/elementwise_ops_test.cc | 87 +- lite/kernels/npu/bridges/interpolate_op.cc | 70 +- lite/kernels/npu/bridges/mul_op.cc | 99 +- .../npu/bridges/paddle_use_npu_bridges.h | 39 +- lite/kernels/npu/bridges/pool_op.cc | 79 +- lite/kernels/npu/bridges/pool_op_test.cc | 5 +- lite/kernels/npu/bridges/reduce_mean_op.cc | 111 + .../npu/bridges/reduce_mean_op_test.cc | 347 +++ lite/kernels/npu/bridges/reshape_op.cc | 6 +- lite/kernels/npu/bridges/sqrt_op.cc | 54 + lite/kernels/npu/bridges/sqrt_op_test.cc | 93 + lite/kernels/npu/bridges/square_op.cc | 55 + lite/kernels/npu/bridges/square_op_test.cc | 92 + lite/kernels/opencl/CMakeLists.txt | 2 +- lite/kernels/opencl/conv_compute.cc | 28 +- lite/kernels/opencl/conv_compute_test.cc | 14 +- .../opencl/depthwise_conv2d_compute.cc | 2 +- .../opencl/depthwise_conv2d_compute_test.cc | 3 
+- lite/kernels/opencl/io_copy_compute.cc | 5 +- lite/kernels/opencl/pool_compute.cc | 14 +- lite/kernels/opencl/pool_compute_test.cc | 4 +- lite/kernels/x86/CMakeLists.txt | 35 +- .../x86/attention_padding_mask_compute.cc | 28 + .../x86/attention_padding_mask_compute.h | 83 + .../attention_padding_mask_compute_test.cc | 132 + lite/kernels/x86/cast_compute.cc | 25 + lite/kernels/x86/cast_compute.h | 80 + lite/kernels/x86/cast_compute_test.cc | 77 + lite/kernels/x86/conv_compute.h | 41 +- lite/kernels/x86/conv_compute_test.cc | 6 +- lite/kernels/x86/fill_constant_compute.cc | 36 + lite/kernels/x86/gather_compute.cc | 32 + lite/kernels/x86/gather_compute.h | 99 + lite/kernels/x86/gather_compute_test.cc | 159 ++ lite/kernels/x86/layer_norm_compute.cc | 29 + lite/kernels/x86/layer_norm_compute.h | 91 + lite/kernels/x86/layer_norm_compute_test.cc | 169 ++ lite/kernels/x86/lookup_table_compute.cc | 10 + lite/kernels/x86/lookup_table_compute.h | 11 +- lite/kernels/x86/lookup_table_compute_test.cc | 82 + .../x86/match_matrix_tensor_compute.cc | 142 + .../kernels/x86/match_matrix_tensor_compute.h | 42 + .../x86/match_matrix_tensor_compute_test.cc | 116 + lite/kernels/x86/mean_compute.cc | 36 - lite/kernels/x86/mul_compute.cc | 18 - lite/kernels/x86/mul_compute.h | 72 - lite/kernels/x86/pool_compute.h | 5 +- lite/kernels/x86/pool_compute_test.cc | 3 +- .../x86/search_aligned_mat_mul_compute.cc | 30 + .../x86/search_aligned_mat_mul_compute.h | 83 + lite/kernels/x86/search_fc_compute.cc | 27 + lite/kernels/x86/search_fc_compute.h | 44 + lite/kernels/x86/search_fc_compute_test.cc | 122 + lite/kernels/x86/search_grnn_compute.cc | 332 +++ lite/kernels/x86/search_grnn_compute.h | 44 + lite/kernels/x86/search_grnn_compute_test.cc | 100 + .../x86/search_group_padding_compute.cc | 28 + .../x86/search_group_padding_compute.h | 105 + .../x86/search_group_padding_compute_test.cc | 92 + .../x86/search_seq_depadding_compute.cc | 76 + .../x86/search_seq_depadding_compute.h | 40 + .../x86/search_seq_depadding_compute_test.cc | 83 + lite/kernels/x86/search_seq_fc_compute.cc | 27 + lite/kernels/x86/search_seq_fc_compute.h | 73 + .../x86/sequence_arithmetic_compute.cc | 38 + .../kernels/x86/sequence_arithmetic_compute.h | 111 + .../x86/sequence_arithmetic_compute_test.cc | 125 + lite/kernels/x86/sequence_concat_compute.cc | 25 + lite/kernels/x86/sequence_concat_compute.h | 84 + .../x86/sequence_concat_compute_test.cc | 163 ++ lite/kernels/x86/sequence_reverse_compute.cc | 32 + lite/kernels/x86/sequence_reverse_compute.h | 63 + .../x86/sequence_reverse_compute_test.cc | 108 + .../x86/sequence_topk_avg_pooling_compute.cc | 29 + .../x86/sequence_topk_avg_pooling_compute.h | 50 + lite/kernels/x86/softmax_compute.cc | 10 + lite/kernels/x86/stack_compute.cc | 25 + lite/kernels/x86/stack_compute.h | 72 + lite/kernels/x86/stack_compute_test.cc | 89 + lite/kernels/x86/var_conv_2d_compute.cc | 27 + lite/kernels/x86/var_conv_2d_compute.h | 213 ++ lite/kernels/x86/var_conv_2d_compute_test.cc | 315 ++ lite/kernels/xpu/bridges/conv_op.cc | 31 +- lite/kernels/xpu/bridges/conv_op_test.cc | 5 +- lite/kernels/xpu/bridges/pool_op_test.cc | 5 +- lite/model_parser/model_parser.cc | 2 +- lite/operators/CMakeLists.txt | 92 +- lite/operators/activation_ops.cc | 1 + lite/operators/attention_padding_mask_op.cc | 70 + lite/operators/attention_padding_mask_op.h | 46 + lite/operators/conv_op.cc | 48 +- lite/operators/conv_op.h | 52 +- lite/operators/conv_transpose_op.cc | 93 +- lite/operators/conv_transpose_op.h | 1 + 
lite/operators/fill_constant_op.cc | 23 + lite/operators/interpolate_op.cc | 55 +- lite/operators/lookup_table_v2_op.cc | 68 + lite/operators/lookup_table_v2_op.h | 46 + lite/operators/lrn_op.cc | 6 +- lite/operators/match_matrix_tensor_op.cc | 105 + lite/operators/match_matrix_tensor_op.h | 49 + lite/operators/op_params.h | 146 +- lite/operators/pool_op.cc | 38 +- lite/operators/pool_op.h | 56 +- lite/operators/search_aligned_mat_mul_op.cc | 101 + lite/operators/search_aligned_mat_mul_op.h | 47 + lite/operators/search_fc_op.cc | 80 + lite/operators/search_fc_op.h | 46 + lite/operators/search_grnn_op.cc | 94 + lite/operators/search_grnn_op.h | 48 + lite/operators/search_group_padding_op.cc | 67 + lite/operators/search_group_padding_op.h | 41 + lite/operators/search_seq_depadding_op.cc | 71 + lite/operators/search_seq_depadding_op.h | 49 + lite/operators/search_seq_fc_op.cc | 80 + lite/operators/search_seq_fc_op.h | 47 + lite/operators/search_seq_softmax_op.cc | 52 + lite/operators/search_seq_softmax_op.h | 47 + lite/operators/sequence_arithmetic_op.cc | 58 + lite/operators/sequence_arithmetic_op.h | 46 + lite/operators/sequence_concat_op.cc | 85 + lite/operators/sequence_concat_op.h | 41 + lite/operators/sequence_reverse_op.cc | 55 + lite/operators/sequence_reverse_op.h | 41 + .../operators/sequence_topk_avg_pooling_op.cc | 85 + lite/operators/sequence_topk_avg_pooling_op.h | 49 + lite/operators/split_op.cc | 31 +- lite/operators/unsqueeze_op.cc | 11 +- lite/operators/var_conv_2d_op.cc | 79 + lite/operators/var_conv_2d_op.h | 41 + lite/tests/cv/image_convert_test.cc | 12 +- lite/tests/kernels/CMakeLists.txt | 2 + .../kernels/bilinear_interp_compute_test.cc | 100 +- .../kernels/conv2d_transpose_compute_test.cc | 185 +- .../kernels/fill_constant_compute_test.cc | 178 ++ lite/tests/kernels/lrn_compute_test.cc | 2 +- .../kernels/nearest_interp_compute_test.cc | 84 +- .../search_aligned_mat_mul_compute_test.cc | 220 ++ .../kernels/search_seq_fc_compute_test.cc | 177 ++ .../kernels/shuffle_channel_compute_test.cc | 15 +- lite/tests/kernels/unsqueeze_compute_test.cc | 5 +- lite/tests/math/CMakeLists.txt | 8 + lite/tests/math/conv_compute_test.cc | 228 +- lite/tests/math/conv_int8_compute_test.cc | 203 +- .../tests/math/conv_transpose_compute_test.cc | 121 +- lite/tests/math/gemm_int8_compute_test.cc | 30 +- lite/tests/math/gemv_int8_compute_test.cc | 30 +- lite/tests/math/layout_compute_test.cc | 608 ++++ lite/tests/math/pool_compute_test.cc | 106 +- lite/tests/math/sgemm_c4_compute_test.cc | 236 ++ lite/tests/math/sgemm_compute_test.cc | 16 +- lite/tests/math/sgemv_compute_test.cc | 194 ++ lite/tests/utils/naive_math_impl.h | 138 +- lite/tools/build.sh | 16 +- lite/tools/build_npu.sh | 4 +- lite/tools/ci_build.sh | 24 +- lite/tools/debug/debug_utils.h | 6 +- lite/utils/cv/paddle_image_preprocess.cc | 234 -- lite/utils/cv/paddle_image_preprocess.h | 6 +- lite/utils/io.h | 60 + lite/utils/logging.cc | 8 +- lite/utils/logging.h | 31 + lite/utils/replace_stl/stream.cc | 58 +- lite/utils/replace_stl/stream.h | 21 +- mobile/src/fpga/V2/api.cpp | 2 +- mobile/src/fpga/V2/image.cpp | 11 +- mobile/src/fpga/V2/pe.cpp | 473 ++- mobile/src/fpga/common/driver.cpp | 2 +- mobile/src/fpga/common/fpga_common.h | 1 + .../fpga/V2/anchor_generator_kernel.cpp | 2 +- .../kernel/fpga/V2/concat_kernel.cpp | 9 + .../kernel/fpga/V2/elementwise_add_kernel.cpp | 33 +- .../fpga/V2/elementwise_add_relu_kernel.cpp | 31 +- .../kernel/fpga/V2/reshape2_kernel.cpp | 23 +- .../kernel/fpga/V2/sigmoid_kernel.cpp | 2 +- 
.../operators/kernel/fpga/V2/slice_kernel.cpp | 29 +- .../src/operators/math/depthwise_conv3x3.cpp | 10 +- 394 files changed, 30742 insertions(+), 3308 deletions(-) create mode 100644 lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc create mode 100644 lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc create mode 100644 lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc create mode 100644 lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc create mode 100644 lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc create mode 100644 lite/backends/arm/math/layout.cc create mode 100644 lite/backends/arm/math/layout.h create mode 100644 lite/backends/arm/math/packed_sgemm_c4.cc create mode 100644 lite/backends/arm/math/packed_sgemm_c4.h create mode 100644 lite/backends/cuda/math/batched_gemm.cc create mode 100644 lite/backends/cuda/math/batched_gemm.h create mode 100644 lite/backends/cuda/math/cudnn_pool.cc create mode 100644 lite/backends/cuda/math/cudnn_pool.h create mode 100644 lite/backends/cuda/math/gemm.cc create mode 100644 lite/backends/cuda/math/gemm.h mode change 100755 => 100644 lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp create mode 100644 lite/backends/x86/math/search_fc.cc create mode 100644 lite/backends/x86/math/search_fc.h create mode 100644 lite/backends/x86/math/sequence_topk_avg_pooling.cc create mode 100644 lite/backends/x86/math/sequence_topk_avg_pooling.h create mode 100644 lite/core/profile/profiler.cc create mode 100644 lite/core/profile/profiler.h create mode 100644 lite/core/profile/test_timer.cc create mode 100644 lite/core/profile/timer.h create mode 100644 lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 create mode 100644 lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 create mode 100644 lite/demo/cxx/mobile_detection/mobile_detection.cc create mode 100644 lite/demo/cxx/mobile_detection/test.jpg create mode 100644 lite/kernels/arm/layout_compute.cc create mode 100644 lite/kernels/arm/layout_compute.h create mode 100644 lite/kernels/arm/lookup_table_compute_test.cc create mode 100644 lite/kernels/cuda/attention_padding_mask_compute.cu create mode 100644 lite/kernels/cuda/attention_padding_mask_compute.h create mode 100644 lite/kernels/cuda/attention_padding_mask_compute_test.cc create mode 100644 lite/kernels/cuda/elementwise_compute.cu create mode 100644 lite/kernels/cuda/elementwise_compute.h create mode 100644 lite/kernels/cuda/elementwise_compute_test.cc create mode 100644 lite/kernels/cuda/match_matrix_tensor_compute.cu create mode 100644 lite/kernels/cuda/match_matrix_tensor_compute.h create mode 100644 lite/kernels/cuda/match_matrix_tensor_compute_test.cc create mode 100644 lite/kernels/cuda/search_aligned_mat_mul_compute.cc create mode 100644 lite/kernels/cuda/search_aligned_mat_mul_compute.h create mode 100644 lite/kernels/cuda/search_aligned_mat_mul_compute_test.cc create mode 100644 lite/kernels/cuda/search_fc_compute.cu create mode 100644 lite/kernels/cuda/search_fc_compute.h create mode 100644 lite/kernels/cuda/search_fc_compute_test.cc create mode 100644 lite/kernels/cuda/search_grnn_compute.cu create mode 100644 lite/kernels/cuda/search_grnn_compute.h create mode 100644 lite/kernels/cuda/search_grnn_compute_test.cc create mode 100644 lite/kernels/cuda/search_group_padding_compute.cu create mode 100644 lite/kernels/cuda/search_group_padding_compute.h create mode 100644 lite/kernels/cuda/search_group_padding_compute_test.cc create mode 100644 lite/kernels/cuda/search_seq_depadding_compute.cu create mode 100644 
lite/kernels/cuda/search_seq_depadding_compute.h create mode 100644 lite/kernels/cuda/search_seq_depadding_compute_test.cc create mode 100644 lite/kernels/cuda/search_seq_fc_compute.cu create mode 100644 lite/kernels/cuda/search_seq_fc_compute.h create mode 100644 lite/kernels/cuda/search_seq_fc_compute_test.cc create mode 100644 lite/kernels/cuda/sequence_arithmetic_compute.cu create mode 100644 lite/kernels/cuda/sequence_arithmetic_compute.h create mode 100644 lite/kernels/cuda/sequence_arithmetic_compute_test.cc create mode 100644 lite/kernels/cuda/sequence_concat_compute.cu create mode 100644 lite/kernels/cuda/sequence_concat_compute.h create mode 100644 lite/kernels/cuda/sequence_concat_compute_test.cc create mode 100644 lite/kernels/cuda/sequence_pool_compute.cu create mode 100644 lite/kernels/cuda/sequence_pool_compute.h create mode 100644 lite/kernels/cuda/sequence_pool_compute_test.cc create mode 100644 lite/kernels/cuda/sequence_reverse_compute.cu create mode 100644 lite/kernels/cuda/sequence_reverse_compute.h create mode 100644 lite/kernels/cuda/sequence_reverse_compute_test.cc create mode 100644 lite/kernels/cuda/sequence_topk_avg_pooling_compute.cu create mode 100644 lite/kernels/cuda/sequence_topk_avg_pooling_compute.h create mode 100644 lite/kernels/cuda/var_conv_2d_compute.cu create mode 100644 lite/kernels/cuda/var_conv_2d_compute.h create mode 100644 lite/kernels/cuda/var_conv_2d_compute_test.cc create mode 100644 lite/kernels/npu/bridges/reduce_mean_op.cc create mode 100644 lite/kernels/npu/bridges/reduce_mean_op_test.cc create mode 100644 lite/kernels/npu/bridges/sqrt_op.cc create mode 100644 lite/kernels/npu/bridges/sqrt_op_test.cc create mode 100644 lite/kernels/npu/bridges/square_op.cc create mode 100644 lite/kernels/npu/bridges/square_op_test.cc create mode 100644 lite/kernels/x86/attention_padding_mask_compute.cc create mode 100644 lite/kernels/x86/attention_padding_mask_compute.h create mode 100644 lite/kernels/x86/attention_padding_mask_compute_test.cc create mode 100644 lite/kernels/x86/cast_compute.cc create mode 100644 lite/kernels/x86/cast_compute.h create mode 100644 lite/kernels/x86/cast_compute_test.cc create mode 100644 lite/kernels/x86/gather_compute.cc create mode 100644 lite/kernels/x86/gather_compute.h create mode 100644 lite/kernels/x86/gather_compute_test.cc create mode 100644 lite/kernels/x86/layer_norm_compute.cc create mode 100644 lite/kernels/x86/layer_norm_compute.h create mode 100644 lite/kernels/x86/layer_norm_compute_test.cc create mode 100644 lite/kernels/x86/lookup_table_compute_test.cc create mode 100644 lite/kernels/x86/match_matrix_tensor_compute.cc create mode 100644 lite/kernels/x86/match_matrix_tensor_compute.h create mode 100644 lite/kernels/x86/match_matrix_tensor_compute_test.cc create mode 100644 lite/kernels/x86/search_aligned_mat_mul_compute.cc create mode 100644 lite/kernels/x86/search_aligned_mat_mul_compute.h create mode 100644 lite/kernels/x86/search_fc_compute.cc create mode 100644 lite/kernels/x86/search_fc_compute.h create mode 100644 lite/kernels/x86/search_fc_compute_test.cc create mode 100644 lite/kernels/x86/search_grnn_compute.cc create mode 100644 lite/kernels/x86/search_grnn_compute.h create mode 100644 lite/kernels/x86/search_grnn_compute_test.cc create mode 100644 lite/kernels/x86/search_group_padding_compute.cc create mode 100644 lite/kernels/x86/search_group_padding_compute.h create mode 100644 lite/kernels/x86/search_group_padding_compute_test.cc create mode 100644 
lite/kernels/x86/search_seq_depadding_compute.cc create mode 100644 lite/kernels/x86/search_seq_depadding_compute.h create mode 100644 lite/kernels/x86/search_seq_depadding_compute_test.cc create mode 100644 lite/kernels/x86/search_seq_fc_compute.cc create mode 100644 lite/kernels/x86/search_seq_fc_compute.h create mode 100644 lite/kernels/x86/sequence_arithmetic_compute.cc create mode 100644 lite/kernels/x86/sequence_arithmetic_compute.h create mode 100644 lite/kernels/x86/sequence_arithmetic_compute_test.cc create mode 100644 lite/kernels/x86/sequence_concat_compute.cc create mode 100644 lite/kernels/x86/sequence_concat_compute.h create mode 100644 lite/kernels/x86/sequence_concat_compute_test.cc create mode 100644 lite/kernels/x86/sequence_reverse_compute.cc create mode 100644 lite/kernels/x86/sequence_reverse_compute.h create mode 100644 lite/kernels/x86/sequence_reverse_compute_test.cc create mode 100644 lite/kernels/x86/sequence_topk_avg_pooling_compute.cc create mode 100644 lite/kernels/x86/sequence_topk_avg_pooling_compute.h create mode 100644 lite/kernels/x86/stack_compute.cc create mode 100644 lite/kernels/x86/stack_compute.h create mode 100644 lite/kernels/x86/stack_compute_test.cc create mode 100644 lite/kernels/x86/var_conv_2d_compute.cc create mode 100644 lite/kernels/x86/var_conv_2d_compute.h create mode 100644 lite/kernels/x86/var_conv_2d_compute_test.cc create mode 100644 lite/operators/attention_padding_mask_op.cc create mode 100644 lite/operators/attention_padding_mask_op.h create mode 100644 lite/operators/lookup_table_v2_op.cc create mode 100644 lite/operators/lookup_table_v2_op.h create mode 100644 lite/operators/match_matrix_tensor_op.cc create mode 100644 lite/operators/match_matrix_tensor_op.h create mode 100644 lite/operators/search_aligned_mat_mul_op.cc create mode 100644 lite/operators/search_aligned_mat_mul_op.h create mode 100644 lite/operators/search_fc_op.cc create mode 100644 lite/operators/search_fc_op.h create mode 100644 lite/operators/search_grnn_op.cc create mode 100644 lite/operators/search_grnn_op.h create mode 100644 lite/operators/search_group_padding_op.cc create mode 100644 lite/operators/search_group_padding_op.h create mode 100644 lite/operators/search_seq_depadding_op.cc create mode 100644 lite/operators/search_seq_depadding_op.h create mode 100644 lite/operators/search_seq_fc_op.cc create mode 100644 lite/operators/search_seq_fc_op.h create mode 100644 lite/operators/search_seq_softmax_op.cc create mode 100644 lite/operators/search_seq_softmax_op.h create mode 100644 lite/operators/sequence_arithmetic_op.cc create mode 100644 lite/operators/sequence_arithmetic_op.h create mode 100644 lite/operators/sequence_concat_op.cc create mode 100644 lite/operators/sequence_concat_op.h create mode 100644 lite/operators/sequence_reverse_op.cc create mode 100644 lite/operators/sequence_reverse_op.h create mode 100644 lite/operators/sequence_topk_avg_pooling_op.cc create mode 100644 lite/operators/sequence_topk_avg_pooling_op.h create mode 100644 lite/operators/var_conv_2d_op.cc create mode 100644 lite/operators/var_conv_2d_op.h create mode 100644 lite/tests/kernels/fill_constant_compute_test.cc create mode 100644 lite/tests/kernels/search_aligned_mat_mul_compute_test.cc create mode 100644 lite/tests/kernels/search_seq_fc_compute_test.cc create mode 100644 lite/tests/math/layout_compute_test.cc create mode 100644 lite/tests/math/sgemm_c4_compute_test.cc create mode 100644 lite/tests/math/sgemv_compute_test.cc mode change 100644 => 100755 
mobile/src/fpga/V2/image.cpp mode change 100644 => 100755 mobile/src/fpga/V2/pe.cpp mode change 100644 => 100755 mobile/src/fpga/common/driver.cpp mode change 100644 => 100755 mobile/src/fpga/common/fpga_common.h mode change 100644 => 100755 mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp mode change 100644 => 100755 mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp mode change 100644 => 100755 mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp mode change 100644 => 100755 mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp mode change 100644 => 100755 mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp mode change 100644 => 100755 mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 9ff908a4c8..a5d3d57218 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -174,15 +174,26 @@ if(NOT WITH_DSO) endif(WIN32) endif(NOT WITH_DSO) -get_filename_component(CUDA_LIB_PATH ${CUDA_curand_LIBRARY} DIRECTORY) -function(import_static_library alias path) +function(add_cuda_static_lib alias cuda_lib_paths file_name) + unset(ABS_PATH CACHE) + find_library(ABS_PATH NAMES ${file_name} PATHS ${${cuda_lib_paths}} NO_DEFAULT_PATH) add_library(${alias} STATIC IMPORTED GLOBAL) - set_property(TARGET ${alias} PROPERTY IMPORTED_LOCATION ${path}) + set_property(TARGET ${alias} PROPERTY IMPORTED_LOCATION ${ABS_PATH}) + set(CUDA_STATIC_MODULES ${CUDA_STATIC_MODULES} ${alias} PARENT_SCOPE) + if (NOT ABS_PATH) + message(FATAL_ERROR "Can not find CUDA static library: ${file_name}") + endif() endfunction() -import_static_library(cudart_static ${CUDA_LIB_PATH}/libcudart_static.a) -import_static_library(cublas_static ${CUDA_LIB_PATH}/libcublas_static.a) -import_static_library(curand_static ${CUDA_LIB_PATH}/libcurand_static.a) -import_static_library(culibos_static ${CUDA_LIB_PATH}/libculibos.a) + +add_cuda_static_lib(cudart_static CUDNN_CHECK_LIBRARY_DIRS libcudart_static.a) +add_cuda_static_lib(cublas_static CUDNN_CHECK_LIBRARY_DIRS libcublas_static.a) +add_cuda_static_lib(curand_static CUDNN_CHECK_LIBRARY_DIRS libcurand_static.a) +add_cuda_static_lib(culibos_static CUDNN_CHECK_LIBRARY_DIRS libculibos.a) +if(NOT ${CUDA_VERSION} LESS 10.1) + add_cuda_static_lib(cublasLt_static CUDNN_CHECK_LIBRARY_DIRS libcublasLt_static.a) +endif() + +set_property(GLOBAL PROPERTY CUDA_STATIC_MODULES cudnn_static ${CUDA_STATIC_MODULES}) # setting nvcc arch flags select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 842b94d47e..574baa86a8 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -26,13 +26,15 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS ${CUDNN_ROOT}/lib64 ${CUDNN_ROOT}/lib ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu - ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/ + /usr/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/ + /usr/lib/${TARGET_ARCH}-linux-gnu/ $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/lib64 $ENV{CUDNN_ROOT}/lib /usr/lib ${CUDA_TOOLKIT_ROOT_DIR} - ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 + ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 + ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ) if((${CUDA_VERSION} GREATER 10.0) OR (${CUDA_VERSION} EQUAL 10.0)) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index bd0d117a63..599e7bba7e 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -1,5 +1,6 @@ INCLUDE(ExternalProject) +SET(EIGEN_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/eigen3) SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) 
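# Editor's note, kept as a CMake comment so it does not disturb the surrounding
# hunk: the add_cuda_static_lib() helper introduced in cmake/cuda.cmake above
# replaces import_static_library() and locates each static CUDA library via
# find_library() instead of a hard-coded CUDA_LIB_PATH. A minimal usage sketch,
# assuming CUDNN_CHECK_LIBRARY_DIRS has already been filled in by cmake/cudnn.cmake:
#
#   add_cuda_static_lib(cudart_static CUDNN_CHECK_LIBRARY_DIRS libcudart_static.a)
#
#   # which expands to roughly:
#   find_library(ABS_PATH NAMES libcudart_static.a
#                PATHS ${CUDNN_CHECK_LIBRARY_DIRS} NO_DEFAULT_PATH)
#   add_library(cudart_static STATIC IMPORTED GLOBAL)
#   set_property(TARGET cudart_static PROPERTY IMPORTED_LOCATION ${ABS_PATH})
#   # and raises FATAL_ERROR if the library cannot be found.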
SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3) INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR}) @@ -16,9 +17,12 @@ if(WITH_AMD_GPU) ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" - GIT_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e + GIT_TAG + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Fhipeigen-upstream-702834151eaebcf955fd09ed0ad83c06.zip + DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} + DOWNLOAD_NO_PROGRESS 1 PREFIX ${EIGEN_SOURCE_DIR} + DOWNLOAD_NAME "hipeigen-upstream-702834151eaebcf955fd09ed0ad83c06.zip" UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" @@ -29,12 +33,14 @@ else() ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen - GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c + GIT_TAG + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip + DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} + DOWNLOAD_NO_PROGRESS 1 PREFIX ${EIGEN_SOURCE_DIR} - DOWNLOAD_NAME "eigen" + DOWNLOAD_NAME "eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip" UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake index 1d61154c0d..5166b494c4 100644 --- a/cmake/external/xbyak.cmake +++ b/cmake/external/xbyak.cmake @@ -20,6 +20,7 @@ endif() include(ExternalProject) +SET(XBYAK_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/xbyak) set(XBYAK_PROJECT extern_xbyak) set(XBYAK_PREFIX_DIR ${THIRD_PARTY_PATH}/xbyak) set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak) @@ -38,8 +39,11 @@ ExternalProject_Add( ${XBYAK_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS "" - GIT_REPOSITORY "https://github.com/herumi/xbyak.git" GIT_TAG "v5.661" # Jul 26th + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Fxbyak-5.66.zip + DOWNLOAD_DIR ${XBYAK_SOURCECODE_DIR} + DOWNLOAD_NAME "xbyak-5.66.zip" + DOWNLOAD_NO_PROGRESS 1 PREFIX ${XBYAK_PREFIX_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT} diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index 23b1e02108..fdc20351e8 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -1,5 +1,6 @@ INCLUDE(ExternalProject) +SET(XXHASH_SOURCECODE_DIR ${CMAKE_SOURCE_DIR}/third-party/xxhash) set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash) set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash) set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include") @@ -18,10 +19,12 @@ if(WIN32) ExternalProject_Add( extern_xxhash ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" GIT_TAG "v0.6.5" + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2FxxHash-0.6.5.zip + DOWNLOAD_DIR ${XXHASH_SOURCECODE_DIR} + DOWNLOAD_NAME "xxHash-0.6.5.zip" + DOWNLOAD_NO_PROGRESS 1 PREFIX ${XXHASH_SOURCE_DIR} - DOWNLOAD_NAME "xxhash" UPDATE_COMMAND "" BUILD_IN_SOURCE 1 PATCH_COMMAND @@ -41,10 +44,12 @@ else() ExternalProject_Add( extern_xxhash ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" GIT_TAG "v0.6.5" + URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2FxxHash-0.6.5.zip + DOWNLOAD_DIR ${XXHASH_SOURCECODE_DIR} + DOWNLOAD_NO_PROGRESS 1 PREFIX 
${XXHASH_SOURCE_DIR} - DOWNLOAD_NAME "xxhash" + DOWNLOAD_NAME "xxHash-0.6.5.zip" UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_IN_SOURCE 1 diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 415eb451a9..225a3c19a1 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -490,6 +490,9 @@ function(nv_binary TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS}) + target_link_libraries(${TARGET_NAME} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES}) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${os_dependency_modules}) if(nv_binary_DEPS) target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS}) add_dependencies(${TARGET_NAME} ${nv_binary_DEPS}) @@ -507,7 +510,7 @@ function(nv_test TARGET_NAME) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest -gflags glog ${os_dependency_modules} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES} ) + gflags glog ${os_dependency_modules} ${CUDNN_LIBRARY} ${CUBLAS_LIBRARIES} ) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} lite_gtest_main gtest gflags glog) common_link(${TARGET_NAME}) add_test(${TARGET_NAME} ${TARGET_NAME}) diff --git a/cmake/lite.cmake b/cmake/lite.cmake index 4423e27e1a..3b9b4ece23 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -164,7 +164,9 @@ function(lite_cc_library TARGET) endfunction() function(lite_cc_binary TARGET) - set(options "") + if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + set(options " -g ") + endif() set(oneValueArgs "") set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) @@ -255,6 +257,7 @@ endfunction() set(arm_kernels CACHE INTERNAL "arm kernels") set(x86_kernels CACHE INTERNAL "x86 kernels") +set(cuda_kernels CACHE INTERNAL "cuda kernels") set(fpga_kernels CACHE INTERNAL "fpga kernels") set(npu_kernels CACHE INTERNAL "npu kernels") set(xpu_kernels CACHE INTERNAL "xpu kernels") diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index 173f04126e..036df2a824 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -5,6 +5,7 @@ message(STATUS "LIGHT_FRAMEWORK:\t${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}") message(STATUS "LITE_WITH_CUDA:\t${LITE_WITH_CUDA}") message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}") message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}") +message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}") message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") @@ -121,6 +122,9 @@ if (LITE_WITH_X86) add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3) endif() +if(LITE_WITH_CUDA) + add_dependencies(publish_inference paddle_full_api_shared) +endif(LITE_WITH_CUDA) if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (NOT LITE_ON_TINY_PUBLISH) # add cxx lib @@ -161,7 +165,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/include" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include" COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/lib" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" 
"${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/utils/cv/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/include" ) add_dependencies(tiny_publish_lib bundle_light_api) add_dependencies(publish_inference tiny_publish_lib) @@ -177,6 +181,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) ) add_dependencies(tiny_publish_cxx_lib paddle_light_api_shared) add_dependencies(publish_inference tiny_publish_cxx_lib) + add_custom_command(TARGET tiny_publish_cxx_lib POST_BUILD + COMMAND ${CMAKE_STRIP} "-s" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/libpaddle_light_api_shared.so) endif() endif() endif() @@ -199,7 +205,7 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) endif() endif() - if ((ARM_TARGET_OS STREQUAL "android") AND (NOT LITE_WITH_OPENCL) AND + if ((ARM_TARGET_OS STREQUAL "android") AND ((ARM_TARGET_ARCH_ABI STREQUAL armv7) OR (ARM_TARGET_ARCH_ABI STREQUAL armv8))) if (NOT LITE_ON_TINY_PUBLISH) # copy @@ -214,6 +220,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/include" ) add_dependencies(publish_inference_android_cxx_demos logging gflags) add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos) @@ -225,6 +234,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/README.md" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile" + ) add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos) endif() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index aef0fc396e..e660bbcdd6 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -9,7 +9,7 @@ if (LITE_ON_TINY_PUBLISH) set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG") endif() set(light_lib_DEPS light_api paddle_api paddle_api_light optimizer) -if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) +if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) #full api dynamic library add_library(paddle_full_api_shared SHARED "") 
target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc) @@ -19,7 +19,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "and add_dependencies(paddle_full_api_shared xxhash) target_link_libraries(paddle_full_api_shared xxhash) endif() - + if(LITE_WITH_CUDA) + target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive") + endif(LITE_WITH_CUDA) #light api dynamic library lite_cc_library(paddle_light_api_shared MODULE SRCS light_api_shared.cc @@ -65,6 +67,7 @@ endif() message(STATUS "get ops ${ops}") message(STATUS "get X86 kernels ${x86_kernels}") +message(STATUS "get CUDA kernels ${cuda_kernels}") message(STATUS "get Host kernels ${host_kernels}") message(STATUS "get ARM kernels ${arm_kernels}") message(STATUS "get NPU kernels ${npu_kernels}") @@ -83,18 +86,17 @@ if (NOT LITE_ON_TINY_PUBLISH) ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} ${npu_bridges} npu_pass XPU_DEPS ${xpu_kernels} ${xpu_bridges} xpu_pass - CL_DEPS ${opencl_kenrels} - FPGA_DEPS ${fpga_kenrels} - BM_DEPS ${bm_kenrels}) + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels}) + BM_DEPS ${bm_kernels}) endif() # for light api set(light_api_deps scope target_wrapper_host model_parser program) if(LITE_WITH_CUDA) + get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES) set(light_api_deps ${light_api_deps} target_wrapper_cuda) - set(cuda_static_deps cudart_static cublas_static curand_static - cudnn_static culibos_static) endif() lite_cc_library(light_api SRCS light_api.cc DEPS scope target_wrapper_host model_parser @@ -104,9 +106,9 @@ lite_cc_library(light_api SRCS light_api.cc ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} - CL_DEPS ${opencl_kenrels} - FPGA_DEPS ${fpga_kenrels} - BM_DEPS ${bm_kenrels}) + CL_DEPS ${opencl_kernels} + FPGA_DEPS ${fpga_kernels}) + BM_DEPS ${bm_kernels}) include(ExternalProject) set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING @@ -305,9 +307,10 @@ if(NOT IOS) NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} - X86_DEPS ${x86_kernels}) + FPGA_DEPS ${fpga_kernels} + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} @@ -316,7 +319,9 @@ if(NOT IOS) CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} - X86_DEPS ${x86_kernels}) + X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels}) + endif() #lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc diff --git a/lite/api/benchmark.cc b/lite/api/benchmark.cc index 462a5e2381..c137324b57 100644 --- a/lite/api/benchmark.cc +++ b/lite/api/benchmark.cc @@ -44,9 +44,10 @@ void OutputOptModel(const std::string& load_model_dir, const std::vector>& input_shapes) { lite_api::CxxConfig config; config.set_model_dir(load_model_dir); - std::vector vaild_places = {Place{TARGET(kARM), PRECISION(kFloat)}, - Place{TARGET(kX86), PRECISION(kFloat)}, - Place{TARGET(kOpenCL), PRECISION(kFloat)}}; + std::vector vaild_places = { + Place{TARGET(kARM), PRECISION(kFloat)}, + Place{TARGET(kX86), PRECISION(kFloat)}, + }; if (FLAGS_is_quantized_model) { vaild_places.insert(vaild_places.begin(), Place{TARGET(kARM), PRECISION(kInt8)}); diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index a2b538aa77..4647f20bbe 100644 
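// Editor's note, kept as a comment (illustrative only): the benchmark.cc hunk
// above trims the default valid places to ARM and X86 float kernels. A minimal
// sketch of how a CxxConfig is filled with those places, including the int8
// place that is prepended for quantized models, assuming the usual lite_api
// headers are included:
//
//   lite_api::CxxConfig config;
//   config.set_model_dir(load_model_dir);
//   std::vector<Place> valid_places = {
//       Place{TARGET(kARM), PRECISION(kFloat)},
//       Place{TARGET(kX86), PRECISION(kFloat)},
//   };
//   if (FLAGS_is_quantized_model) {
//     valid_places.insert(valid_places.begin(),
//                         Place{TARGET(kARM), PRECISION(kInt8)});
//   }
//   config.set_valid_places(valid_places);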
--- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -24,13 +24,6 @@ namespace paddle { namespace lite { -static const char TAILORD_OPS_SOURCE_LIST_FILENAME[] = - ".tailored_ops_source_list"; -static const char TAILORD_OPS_LIST_NAME[] = ".tailored_ops_list"; -static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] = - ".tailored_kernels_source_list"; -static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list"; - void Predictor::SaveModel(const std::string &dir, lite_api::LiteModelType model_type, bool record_info) { @@ -140,21 +133,35 @@ lite::Tensor *Predictor::GetInput(size_t offset) { // get inputs names std::vector Predictor::GetInputNames() { return input_names_; } + // get outputnames std::vector Predictor::GetOutputNames() { return output_names_; } + // append the names of inputs and outputs into input_names_ and output_names_ void Predictor::PrepareFeedFetch() { + std::vector feeds; + std::vector fetchs; +#if defined(LITE_WITH_NPU) || defined(LITE_WITH_XPU) + // The shape of input tensors must be determined before generating NPU and XPU + // program. auto current_block = program_desc_.GetBlock(0); - std::vector feeds; - std::vector fetchs; for (size_t i = 0; i < current_block->OpsSize(); i++) { auto op = current_block->GetOp(i); +#else + if (!program_) { + GenRuntimeProgram(); + } + const auto &insts = program_->instructions(); + for (size_t i = 0; i < program_->num_instructions(); i++) { + const auto &op = insts[i].op()->op_info(); +#endif if (op->Type() == "feed") { feeds.push_back(op); } else if (op->Type() == "fetch") { fetchs.push_back(op); } } + input_names_.resize(feeds.size()); output_names_.resize(fetchs.size()); for (size_t i = 0; i < feeds.size(); i++) { @@ -190,6 +197,7 @@ std::vector Predictor::GetOutputs() const { const cpp::ProgramDesc &Predictor::program_desc() const { return program_desc_; } + const RuntimeProgram &Predictor::runtime_program() const { return *program_; } void Predictor::Build(const lite_api::CxxConfig &config, @@ -246,16 +254,18 @@ void Predictor::Build(const cpp::ProgramDesc &desc, const std::vector &valid_places, const std::vector &passes) { program_desc_ = desc; + // `inner_places` is used to optimize passes std::vector inner_places = valid_places; inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)); inner_places.emplace_back( TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); Program program(desc, scope_, inner_places); - /// The first place in valid_places is + core::KernelPickFactor factor; factor.ConsiderTarget(); factor.ConsiderPrecision(); factor.ConsiderDataLayout(); + optimizer_.Run(std::move(program), inner_places, factor, passes); exec_scope_ = optimizer_.exec_scope(); PrepareFeedFetch(); @@ -271,6 +281,7 @@ const lite::Tensor *Predictor::GetTensor(const std::string &name) const { auto *var = exec_scope_->FindVar(name); return &var->Get(); } + // get input by name lite::Tensor *Predictor::GetInputByName(const std::string &name) { auto element = std::find(input_names_.begin(), input_names_.end(), name); diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h index 502ce812e1..504710d9fa 100644 --- a/lite/api/cxx_api.h +++ b/lite/api/cxx_api.h @@ -29,6 +29,13 @@ namespace paddle { namespace lite { +static const char TAILORD_OPS_SOURCE_LIST_FILENAME[] = + ".tailored_ops_source_list"; +static const char TAILORD_OPS_LIST_NAME[] = ".tailored_ops_list"; +static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] = + ".tailored_kernels_source_list"; +static const char TAILORD_KERNELS_LIST_NAME[] = 
".tailored_kernels_list"; + /* * Predictor for inference, input a model, it will optimize and execute it. */ diff --git a/lite/api/mobilenetv1_test.cc b/lite/api/mobilenetv1_test.cc index 63a401745b..79f9bea762 100644 --- a/lite/api/mobilenetv1_test.cc +++ b/lite/api/mobilenetv1_test.cc @@ -123,8 +123,11 @@ TEST(MobileNetV1, test_arm) { #ifdef LITE_WITH_OPENCL TEST(MobileNetV1, test_opencl) { std::vector valid_places({ - Place{TARGET(kOpenCL), PRECISION(kFloat)}, - Place{TARGET(kARM), PRECISION(kFloat)}, + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)}, + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}, + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)}, + TARGET(kARM), // enable kARM CPU kernel when no opencl kernel }); TestModel(valid_places); diff --git a/lite/api/model_optimize_tool.cc b/lite/api/model_optimize_tool.cc index 1aef522b2a..1c426e8568 100644 --- a/lite/api/model_optimize_tool.cc +++ b/lite/api/model_optimize_tool.cc @@ -20,6 +20,7 @@ // model_optimize_tool's compiling period #include "all_kernel_faked.cc" // NOLINT #include "kernel_src_map.h" // NOLINT +#include "lite/api/cxx_api.h" #include "lite/api/paddle_api.h" #include "lite/api/paddle_use_ops.h" #include "lite/api/paddle_use_passes.h" @@ -31,6 +32,18 @@ DEFINE_string(model_dir, "", "path of the model. This option will be ignored if model_file " "and param_file are exist"); +DEFINE_string(model_filename, + "", + "model topo filename of the model in models set. This option" + " will be used to specific tailoring"); +DEFINE_string(param_filename, + "", + "model param filename of the model in models set. This option" + " will be used to specific tailoring"); +DEFINE_string(model_set_dir, + "", + "path of the models set. This option will be used to specific" + " tailoring"); DEFINE_string(model_file, "", "model file path of the combined-param model"); DEFINE_string(param_file, "", "param file path of the combined-param model"); DEFINE_string( @@ -58,29 +71,23 @@ void DisplayKernels() { LOG(INFO) << ::paddle::lite::KernelRegistry::Global().DebugString(); } -void Main() { - if (!FLAGS_model_file.empty() && !FLAGS_param_file.empty()) { - LOG(WARNING) - << "Load combined-param model. 
Option model_dir will be ignored"; - } - - if (FLAGS_display_kernels) { - DisplayKernels(); - exit(0); - } - - lite_api::CxxConfig config; - config.set_model_dir(FLAGS_model_dir); - config.set_model_file(FLAGS_model_file); - config.set_param_file(FLAGS_param_file); - +std::vector ParserValidPlaces() { std::vector valid_places; - auto target_reprs = lite::Split(FLAGS_valid_targets, " "); + auto target_reprs = lite::Split(FLAGS_valid_targets, ","); for (auto& target_repr : target_reprs) { if (target_repr == "arm") { valid_places.emplace_back(TARGET(kARM)); } else if (target_repr == "opencl") { - valid_places.emplace_back(TARGET(kOpenCL)); + valid_places.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNCHW)}); + valid_places.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFP16), DATALAYOUT(kNHWC)}); + valid_places.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)}); + valid_places.emplace_back( + Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)}); + valid_places.emplace_back( + TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel } else if (target_repr == "x86") { valid_places.emplace_back(TARGET(kX86)); } else { @@ -100,26 +107,130 @@ void Main() { valid_places.insert(valid_places.begin(), Place{TARGET(kARM), PRECISION(kInt8)}); } + return valid_places; +} + +void RunOptimize(const std::string& model_dir, + const std::string& model_file, + const std::string& param_file, + const std::string& optimize_out, + const std::string& optimize_out_type, + const std::vector& valid_places, + bool record_tailoring_info) { + if (!model_file.empty() && !param_file.empty()) { + LOG(WARNING) + << "Load combined-param model. Option model_dir will be ignored"; + } + + lite_api::CxxConfig config; + config.set_model_dir(model_dir); + config.set_model_file(model_file); + config.set_param_file(param_file); + config.set_valid_places(valid_places); auto predictor = lite_api::CreatePaddlePredictor(config); LiteModelType model_type; - if (FLAGS_optimize_out_type == "protobuf") { + if (optimize_out_type == "protobuf") { model_type = LiteModelType::kProtobuf; - } else if (FLAGS_optimize_out_type == "naive_buffer") { + } else if (optimize_out_type == "naive_buffer") { model_type = LiteModelType::kNaiveBuffer; } else { - LOG(FATAL) << "Unsupported Model type :" << FLAGS_optimize_out_type; + LOG(FATAL) << "Unsupported Model type :" << optimize_out_type; } - OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); + OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map); predictor->SaveOptimizedModel( - FLAGS_optimize_out, model_type, FLAGS_record_tailoring_info); - if (FLAGS_record_tailoring_info) { + optimize_out, model_type, record_tailoring_info); + if (record_tailoring_info) { LOG(INFO) << "Record the information of tailored model into :" - << FLAGS_optimize_out; + << optimize_out; + } +} + +void CollectModelMetaInfo(const std::string& output_dir, + const std::vector& models, + const std::string& filename) { + std::set total; + for (const auto& name : models) { + std::string model_path = + lite::Join({output_dir, name, filename}, "/"); + auto lines = lite::ReadLines(model_path); + total.insert(lines.begin(), lines.end()); + } + std::string output_path = + lite::Join({output_dir, filename}, "/"); + lite::WriteLines(std::vector(total.begin(), total.end()), + output_path); +} + +void Main() { + if (FLAGS_display_kernels) { + DisplayKernels(); + exit(0); } + + auto valid_places = ParserValidPlaces(); + if (FLAGS_model_set_dir == "") { + 
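// Editor's note (illustrative comment, not part of the patch): with this
// refactoring --valid_targets is split on commas instead of spaces, and the
// tool can optimize either a single model or every model under a directory.
// A hypothetical invocation of the model-set path, using only flags defined
// above (the file names __model__ and __params__ are placeholders):
//
//   ./model_optimize_tool \
//       --model_set_dir=./models \
//       --model_filename=__model__ --param_filename=__params__ \
//       --optimize_out=./models_opt --optimize_out_type=naive_buffer \
//       --valid_targets=arm,opencl \
//       --record_tailoring_info=true
//
// Each sub-directory of --model_set_dir is optimized into a sub-directory of
// --optimize_out with the same name, and the per-model tailoring lists are
// merged afterwards by CollectModelMetaInfo(). The branch below handles the
// single-model case (empty --model_set_dir) and returns early.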
RunOptimize(FLAGS_model_dir, + FLAGS_model_file, + FLAGS_param_file, + FLAGS_optimize_out, + FLAGS_optimize_out_type, + valid_places, + FLAGS_record_tailoring_info); + return; + } + + if (!FLAGS_record_tailoring_info) { + LOG(WARNING) << "--model_set_dir option only be used with " + "--record_tailoring_info=true together"; + return; + } + + auto model_dirs = lite::ListDir(FLAGS_model_set_dir, true); + if (model_dirs.size() == 0) { + LOG(FATAL) << "[" << FLAGS_model_set_dir << "] does not contain any model"; + } + // Optimize models in FLAGS_model_set_dir + for (const auto& name : model_dirs) { + std::string input_model_dir = + lite::Join({FLAGS_model_set_dir, name}, "/"); + std::string output_model_dir = + lite::Join({FLAGS_optimize_out, name}, "/"); + + std::string model_file = ""; + std::string param_file = ""; + + if (FLAGS_model_filename != "" && FLAGS_param_filename != "") { + model_file = + lite::Join({input_model_dir, FLAGS_model_filename}, "/"); + param_file = + lite::Join({input_model_dir, FLAGS_param_filename}, "/"); + } + + LOG(INFO) << "Start optimize model: " << input_model_dir; + RunOptimize(input_model_dir, + model_file, + param_file, + output_model_dir, + FLAGS_optimize_out_type, + valid_places, + FLAGS_record_tailoring_info); + LOG(INFO) << "Optimize done. "; + } + + // Collect all models information + CollectModelMetaInfo( + FLAGS_optimize_out, model_dirs, lite::TAILORD_OPS_SOURCE_LIST_FILENAME); + CollectModelMetaInfo( + FLAGS_optimize_out, model_dirs, lite::TAILORD_OPS_LIST_NAME); + CollectModelMetaInfo(FLAGS_optimize_out, + model_dirs, + lite::TAILORD_KERNELS_SOURCE_LIST_FILENAME); + CollectModelMetaInfo( + FLAGS_optimize_out, model_dirs, lite::TAILORD_KERNELS_LIST_NAME); } } // namespace lite_api diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc index 1358267000..a04e86b7d2 100644 --- a/lite/api/model_test.cc +++ b/lite/api/model_test.cc @@ -21,14 +21,14 @@ #include "lite/api/paddle_use_passes.h" #include "lite/api/test_helper.h" #include "lite/core/device_info.h" -#include "lite/tests/utils/timer.h" +#include "lite/core/profile/timer.h" #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" #ifdef LITE_WITH_PROFILE #include "lite/core/profile/basic_profiler.h" #endif // LITE_WITH_PROFILE -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DEFINE_string(input_shape, "1,3,224,224", @@ -102,20 +102,20 @@ void Run(const std::vector>& input_shapes, Timer ti; for (int j = 0; j < repeat; ++j) { - ti.start(); + ti.Start(); predictor->Run(); - ti.end(); - LOG(INFO) << "iter: " << j << ", time: " << ti.latest_time() << " ms"; + float t = ti.Stop(); + LOG(INFO) << "iter: " << j << ", time: " << t << " ms"; } LOG(INFO) << "================== Speed Report ==================="; LOG(INFO) << "Model: " << model_dir << ", power_mode: " << static_cast(power_mode) << ", threads num " << thread_num << ", warmup: " << warmup_times - << ", repeats: " << repeat << ", avg time: " << ti.get_average_ms() + << ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg() << " ms" - << ", min time: " << ti.get_min_time() << " ms" - << ", max time: " << ti.get_max_time() << " ms."; + << ", min time: " << ti.LapTimes().Min() << " ms" + << ", max time: " << ti.LapTimes().Max() << " ms."; auto output = predictor->GetOutput(0); auto out = output->data(); diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index f148096bb6..aabb535292 100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -93,7 +93,7 @@ void Tensor::CopyFromCpu(const T 
*src_data) { } } template -void Tensor::CopyToCpu(T *data) { +void Tensor::CopyToCpu(T *data) const { const T *src_data = tensor(raw_tensor_)->data(); int64_t num = tensor(raw_tensor_)->numel(); CHECK(num > 0) << "You should call Resize interface first"; @@ -121,12 +121,13 @@ template void Tensor::CopyFromCpu(const int *); template void Tensor::CopyFromCpu(const float *); template void Tensor::CopyFromCpu(const int8_t *); template void Tensor::CopyFromCpu(const int *); +template void Tensor::CopyFromCpu(const int64_t *); template void Tensor::CopyFromCpu(const float *); template void Tensor::CopyFromCpu(const int8_t *); -template void Tensor::CopyToCpu(int8_t *); -template void Tensor::CopyToCpu(float *); -template void Tensor::CopyToCpu(int *); +template void Tensor::CopyToCpu(int8_t *) const; +template void Tensor::CopyToCpu(float *) const; +template void Tensor::CopyToCpu(int *) const; shape_t Tensor::shape() const { return ctensor(raw_tensor_)->dims().Vectorize(); diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index 42b455da81..c578769bd5 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -49,7 +49,7 @@ struct LITE_API Tensor { void CopyFromCpu(const T* data); template - void CopyToCpu(T* data); + void CopyToCpu(T* data) const; /// Shape of the tensor. shape_t shape() const; TargetType target() const; diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index 3d7d496afb..894d839185 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -55,8 +55,7 @@ const std::string& TargetToStr(TargetType target) { "any", "fpga", "npu", - "xpu", - "bm"}; + "xpu"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -94,8 +93,7 @@ const std::string& TargetRepr(TargetType target) { "kAny", "kFPGA", "kNPU", - "kXPU", - "kBM"}; + "kXPU"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -131,8 +129,7 @@ std::set ExpandValidTargets(TargetType target) { TARGET(kOpenCL), TARGET(kNPU), TARGET(kXPU), - TARGET(kFPGA), - TARGET(kBM)}); + TARGET(kFPGA)}); if (target == TARGET(kAny)) { return valid_set; } diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index a13abb699c..07284be095 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -52,9 +52,8 @@ enum class TargetType : int { kFPGA = 7, kNPU = 8, kXPU = 9, - kBM = 10, kAny = 6, // any target - NUM = 11, // number of fields. + NUM = 10, // number of fields. }; enum class PrecisionType : int { kUnk = 0, diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index 70355fdf89..9d56d262ab 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -20,7 +20,12 @@ USE_MIR_PASS(static_kernel_pick_pass); USE_MIR_PASS(variable_place_inference_pass); USE_MIR_PASS(type_target_cast_pass); USE_MIR_PASS(generate_program_pass); -USE_MIR_PASS(subgraph_program_pass); +#ifdef LITE_WITH_NPU +USE_MIR_PASS(generate_npu_program_pass); +#endif +#ifdef LITE_WITH_XPU +USE_MIR_PASS(generate_xpu_program_pass); +#endif USE_MIR_PASS(io_copy_kernel_pick_pass); USE_MIR_PASS(argument_type_display_pass); diff --git a/lite/api/test_step_rnn_lite_x86.cc b/lite/api/test_step_rnn_lite_x86.cc index c483373dc7..5314c5ed75 100644 --- a/lite/api/test_step_rnn_lite_x86.cc +++ b/lite/api/test_step_rnn_lite_x86.cc @@ -12,20 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - #include #include #include diff --git a/lite/backends/arm/math/CMakeLists.txt b/lite/backends/arm/math/CMakeLists.txt index cbbcf49a5f..076c791daa 100644 --- a/lite/backends/arm/math/CMakeLists.txt +++ b/lite/backends/arm/math/CMakeLists.txt @@ -57,9 +57,10 @@ endif() if (NOT HAS_ARM_MATH_LIB_DIR) # TODO(xxx): seperate them and do not deps proto, eigen3 - cc_library(math_arm SRCS - funcs.cc + cc_library(math_arm SRCS + funcs.cc packed_sgemm.cc + packed_sgemm_c4.cc sgemm.cc gemm_prepacked_int8.cc gemm_s8.cc @@ -67,8 +68,10 @@ if (NOT HAS_ARM_MATH_LIB_DIR) gemv_arm_int8.cc conv3x3s1_direct_fp32.cc conv3x3s2_direct_fp32.cc - conv3x3s1_depthwise_fp32.cc - conv3x3s2_depthwise_fp32.cc + conv3x3s1p01_depthwise_fp32.cc + conv3x3s2p01_depthwise_fp32.cc + conv3x3s1px_depthwise_fp32.cc + conv3x3s2px_depthwise_fp32.cc conv3x3s1_direct_int8.cc conv3x3s2_direct_int8.cc conv3x3s1_depthwise_int8.cc @@ -76,16 +79,14 @@ if (NOT HAS_ARM_MATH_LIB_DIR) conv5x5s1_depthwise_int8.cc conv5x5s1_depthwise_fp32.cc conv5x5s2_depthwise_fp32.cc - conv_depthwise_3x3p0.cc - conv_depthwise_3x3p1.cc - conv_depthwise_3x3s1.cc - conv_depthwise_3x3s2.cc + conv3x3_winograd_fp32_c4.cc conv_winograd_3x3.cc conv_impl.cc - softmax.cc + softmax.cc scale.cc pooling.cc elementwise.cc + layout.cc lrn.cc decode_bboxes.cc concat.cc @@ -121,4 +122,3 @@ if (NOT HAS_ARM_MATH_LIB_DIR) anchor_generator.cc DEPS ${lite_kernel_deps} context tensor) endif() - diff --git a/lite/backends/arm/math/col_im_transform.cc b/lite/backends/arm/math/col_im_transform.cc index b5d2c6af13..38be1d689d 100644 --- a/lite/backends/arm/math/col_im_transform.cc +++ b/lite/backends/arm/math/col_im_transform.cc @@ -32,8 +32,10 @@ void col2im(const float* data_col, const int width, const int kernel_h, const int kernel_w, - const int pad_h, - const int pad_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, const int stride_h, const int stride_w, const int dilation_h, @@ -41,19 +43,22 @@ void col2im(const float* data_col, float* data_im) { memset(data_im, 0, height * width * channels * sizeof(float)); const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) / + stride_h + + 1; const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + (width + pad_w0 + pad_w1 - (dilation_w * (kernel_w - 1) + 1)) / stride_w + + 1; const int channel_size = height * width; for (int channel = channels; channel--; data_im += channel_size) { for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; + int input_row = -pad_h0 + kernel_row * dilation_h; for (int output_rows = output_h; output_rows; output_rows--) { if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { data_col += output_w; } else { - int input_col = -pad_w + 
kernel_col * dilation_w; + int input_col = -pad_w0 + kernel_col * dilation_w; for (int output_col = output_w; output_col; output_col--) { if (is_a_ge_zero_and_a_lt_b(input_col, width)) { data_im[input_row * width + input_col] += *data_col; diff --git a/lite/backends/arm/math/col_im_transform.h b/lite/backends/arm/math/col_im_transform.h index 8560679d7f..e3e32c4715 100644 --- a/lite/backends/arm/math/col_im_transform.h +++ b/lite/backends/arm/math/col_im_transform.h @@ -26,8 +26,10 @@ void col2im(const Dtype* data_col, const int width, const int kernel_h, const int kernel_w, - const int pad_h, - const int pad_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, const int stride_h, const int stride_w, const int dilation_h, diff --git a/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc new file mode 100644 index 0000000000..5834461b8f --- /dev/null +++ b/lite/backends/arm/math/conv3x3_winograd_fp32_c4.cc @@ -0,0 +1,564 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/arm/math/conv_block_utils.h" +#include "lite/backends/arm/math/conv_impl.h" +#include "lite/backends/arm/math/packed_sgemm_c4.h" +#ifdef ARM_WITH_OMP +#include +#endif +#include + +namespace paddle { +namespace lite { +namespace arm { +namespace math { +void input_trans_c4(const float* src, + int src_stride, + float* dest, + int dest_stride); +void output_trans_c4(const float* src, + int src_stride, + float* dest, + int dest_stride); +void output_trans_c4_post(const float* src, + int src_stride, + float* dest, + int dest_stride, + float* bias_value, + bool has_relu); +void weight_trans_c4( + float* dest, const float* src, int ic, int oc, void* workspace); + +/* +*The following function conv_compute_6x6_3x3 is base on +*MNN[https://github.com/alibaba/MNN] +* +*Copyright © 2018, Alibaba Group Holding Limited +*/ +void conv_compute_6x6_3x3(const float* input, + float* output, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const float* weight, + const float* bias, + const operators::ConvParam& param, + ARMContext* ctx) { + const int pad_h = (*param.paddings)[0]; + const int pad_w = (*param.paddings)[2]; + float* tmp_work_space = + ctx->workspace_data() + ctx->llc_size() / sizeof(float); + + int in_n_stride = chin * hin * win; + int out_n_stride = chout * hout * wout; + int ic_stride = win * hin; + int oc_stride = wout * hout; + int ic_4 = (chin + 3) / 4; + int oc_4 = (chout + 3) / 4; + + int tile_w = (wout + 5) / 6; + int tile_h = (hout + 5) / 6; + int size_tile = tile_h * tile_w; + float zero_ptr[8]; + memset(zero_ptr, 0, 8 * sizeof(float)); + + int w_pad = win + pad_w * 2; + int h_pad = hin + pad_h * 2; + float* input_c4 = tmp_work_space; + int new_h_stride = w_pad * 4; + int new_c_stride = new_h_stride * h_pad; + + int ic_4_stride = w_pad * h_pad * 4; + int oc_4_stride = wout * hout * 4; + + int tile_block = 
8; +#ifdef __aarch64__ + tile_block = 16; +#endif + int block_count = (size_tile + tile_block - 1) / tile_block; + + int threads = ctx->threads(); + float* g_tmp_data = tmp_work_space + ic_4 * new_c_stride; + int tmp_data_thread_stride = tile_block * (oc_4 + ic_4) * 256; + memset(g_tmp_data, 0, threads * tmp_data_thread_stride * sizeof(float)); + float* g_trans_tmp_data = g_tmp_data + threads * tmp_data_thread_stride; + float* g_trans_remain_tmp_data = g_trans_tmp_data + threads * 256; + + // begin compute + for (int ni = 0; ni < num; ++ni) { + // trans input to c4 + for (int i = 0; i < ic_4; ++i) { + prepack_input_nxwc4_dw(input + ni * in_n_stride, + input_c4 + i * new_c_stride, + i * 4, + -pad_h, + hin + pad_h, + -pad_w, + win + pad_w, + chin, + win, + hin, + zero_ptr); + } + float* output_ptr = output + ni * out_n_stride; + + const float* weight_ptr = weight; + const float* bias_ptr = bias; +#pragma omp parallel for num_threads(threads) + for (int tbi = 0; tbi < block_count; ++tbi) { +#ifdef ARM_WITH_OMP + float* tmp_data = + g_tmp_data + omp_get_thread_num() * tmp_data_thread_stride; + float* trans_tmp_data = g_trans_tmp_data + omp_get_thread_num() * 256; + float* trans_remain_tmp_data = + g_trans_remain_tmp_data + omp_get_thread_num() * 256; +#else + float* tmp_data = g_tmp_data; + float* trans_tmp_data = g_trans_tmp_data; + float* trans_remain_tmp_data = g_trans_remain_tmp_data; +#endif + int tile_index = tbi * tile_block; + int tile_remain = size_tile - tile_index; + int tile_count = tile_remain > tile_block ? tile_block : tile_remain; + + // input trans + int c_gi_stride = tile_count * oc_4 * 4; + int b_gi_stride = tile_count * ic_4 * 4; + //* + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int src_x = tw_index * 6; + int src_y = th_index * 6; + int ex = src_x + 8 > w_pad ? w_pad - src_x : 8; + int ey = src_y + 8 > h_pad ? 
h_pad - src_y : 8; + + float* dst_ptr = tmp_data + ti * 4; + const float* src_ptr = input_c4 + (src_y * w_pad + src_x) * 4; + + if (ex == 8 && ey == 8) { + // trans input + for (int ci = 0; ci < ic_4; ++ci) { + const float* src_ci = src_ptr + ci * ic_4_stride; + for (int i = 0; i < 8; ++i) { + const float* ci_ptr = src_ci + i * w_pad * 4; + input_trans_c4(ci_ptr, 4, trans_tmp_data + i * 4, 32); + } + float* dst_ci = dst_ptr + ci * tile_count * 4; + for (int i = 0; i < 8; ++i) { + input_trans_c4(trans_tmp_data + i * 32, + 4, + dst_ci + i * b_gi_stride * 8, + b_gi_stride); + } + } + } else { + // trans remain input + int x_size = ex; + for (int ci = 0; ci < ic_4; ++ci) { + const float* src_ci = src_ptr + ci * ic_4_stride; + // pad + memset(trans_remain_tmp_data, 0, 256 * sizeof(float)); + if (x_size > 0) { + for (int yi = 0; yi < ey; ++yi) { + float* dst_yi = trans_remain_tmp_data + yi * 32; + const float* src_yi = src_ci + w_pad * yi * 4; + memcpy(dst_yi, src_yi, x_size * sizeof(float) * 4); + } + } + + // trans + for (int i = 0; i < 8; ++i) { + float* ci_ptr = trans_remain_tmp_data + i * 32; + input_trans_c4(ci_ptr, 4, trans_tmp_data + i * 4, 32); + } + float* dst_ci = dst_ptr + ci * tile_count * 4; + for (int i = 0; i < 8; ++i) { + input_trans_c4(trans_tmp_data + i * 32, + 4, + dst_ci + i * b_gi_stride * 8, + b_gi_stride); + } + } // for ci_4 + } + } + //*/ + // input trans end + // *begin compute dot + // * + //* + float* dst_temp_data = tmp_data + tile_block * ic_4 * 256; + float* b_ptr = tmp_data; + int w_gi_stride = ic_4 * oc_4 * 16; + for (int gi = 0; gi < 64; ++gi) { + float* origin_C = dst_temp_data + gi * c_gi_stride; + float* origin_B = b_ptr + gi * b_gi_stride; + const float* origin_A = weight + gi * w_gi_stride; + sgemm_prepack_c4_small(oc_4 * 4, + tile_count, + ic_4 * 4, + origin_A, + origin_B, + origin_C, + nullptr, + false, + false, + ctx); + } + //*/ + //* + // output trans + float bias_value[4]; + memset(bias_value, 0, 4 * sizeof(float)); + + for (int ti = 0; ti < tile_count; ++ti) { + int index = tile_index + ti; + + int tw_index = index % tile_w; + int th_index = index / tile_w; + + int dst_x = tw_index * 6; + int dst_y = th_index * 6; + + int ex = dst_x + 6 > wout ? wout - dst_x : 6; + int ey = dst_y + 6 > hout ? 
hout - dst_y : 6; + + float* dst_ptr = output + (dst_y * wout + dst_x) * 4; + float* src_ptr = dst_temp_data + ti * 4; + + if (ex == 6) { + // trans output + for (int ci = 0; ci < oc_4; ++ci) { + if (param.bias) { + bias_value[0] = bias[ci * 4]; + bias_value[1] = bias[ci * 4 + 1]; + bias_value[2] = bias[ci * 4 + 2]; + bias_value[3] = bias[ci * 4 + 3]; + } + + float* dst_ci = dst_ptr + ci * oc_4_stride; + float* src_ci = src_ptr + ci * tile_count * 4; + for (int i = 0; i < 8; ++i) { + output_trans_c4(src_ci + i * c_gi_stride * 8, + c_gi_stride, + trans_tmp_data + i * 4, + 32); + } + for (int i = 0; i < ey; ++i) { + output_trans_c4_post(trans_tmp_data + i * 32, + 4, + trans_remain_tmp_data + i * 24, + 4, + bias_value, + param.fuse_relu); + } + write_to_output_c4_fp32(trans_remain_tmp_data, + output_ptr, + ci * 4, + ci * 4 + 4, + dst_y, + dst_y + ey, + dst_x, + dst_x + ex, + chout, + hout, + wout, + false, + zero_ptr); + } + } else { + for (int ci = 0; ci < oc_4; ++ci) { + if (param.bias) { + bias_value[0] = bias[ci * 4]; + bias_value[1] = bias[ci * 4 + 1]; + bias_value[2] = bias[ci * 4 + 2]; + bias_value[3] = bias[ci * 4 + 3]; + } + // trans output + float* dst_ci = dst_ptr + ci * oc_4_stride; + float* src_ci = src_ptr + ci * tile_count * 4; + for (int i = 0; i < 8; ++i) { + output_trans_c4(src_ci + i * c_gi_stride * 8, + c_gi_stride, + trans_tmp_data + i * 4, + 32); + } + for (int i = 0; i < ey; ++i) { + output_trans_c4_post(trans_tmp_data + i * 32, + 4, + trans_remain_tmp_data + i * 24, + 4, + bias_value, + param.fuse_relu); + } + // copy to dest + memset(trans_tmp_data, 0, 144 * sizeof(float)); + for (int i = 0; i < ey; ++i) { + memcpy(trans_tmp_data + i * ex * 4, + trans_remain_tmp_data + i * 24, + ex * sizeof(float) * 4); + } + write_to_output_c4_fp32(trans_tmp_data, + output_ptr, + ci * 4, + ci * 4 + 4, + dst_y, + dst_y + ey, + dst_x, + dst_x + ex, + chout, + hout, + wout, + false, + zero_ptr); + } + } + } + //*/ + } // for block_count + } // for num +} // conv_compute + +void output_trans_c4(const float* src, + int src_stride, + float* dest, + int dest_stride) { + const float32x4_t src0 = vld1q_f32(src); + const float32x4_t src1 = vld1q_f32(src + src_stride); + const float32x4_t src2 = vld1q_f32(src + src_stride * 2); + const float32x4_t src3 = vld1q_f32(src + src_stride * 3); + const float32x4_t src4 = vld1q_f32(src + src_stride * 4); + const float32x4_t src5 = vld1q_f32(src + src_stride * 5); + const float32x4_t src6 = vld1q_f32(src + src_stride * 6); + const float32x4_t src7 = vld1q_f32(src + src_stride * 7); + + float32x4_t tmp024a = vaddq_f32(src1, src2); + float32x4_t tmp135a = vsubq_f32(src1, src2); + float32x4_t tmp024b = vaddq_f32(src3, src4); + float32x4_t tmp135b = vsubq_f32(src3, src4); + float32x4_t tmp024c = vaddq_f32(src5, src6); + float32x4_t tmp135c = vsubq_f32(src5, src6); + + float32x4_t dest0 = + vaddq_f32(vaddq_f32(vaddq_f32(src0, tmp024a), tmp024b), tmp024c); + float32x4_t dest2 = vaddq_f32(vaddq_f32(tmp024a, vmulq_n_f32(tmp024b, 4)), + vmulq_n_f32(tmp024c, 0.25f)); + float32x4_t dest4 = vaddq_f32(vaddq_f32(tmp024a, vmulq_n_f32(tmp024b, 16)), + vmulq_n_f32(tmp024c, 0.0625f)); + + float32x4_t dest1 = vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 2)), + vmulq_n_f32(tmp135c, 0.5f)); + float32x4_t dest3 = vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 8)), + vmulq_n_f32(tmp135c, 0.125f)); + float32x4_t dest5 = + vaddq_f32(src7, + vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 32)), + vmulq_n_f32(tmp135c, 0.03125f))); + + vst1q_f32(dest, dest0); + 
vst1q_f32(dest + dest_stride, dest1); + vst1q_f32(dest + dest_stride * 2, dest2); + vst1q_f32(dest + dest_stride * 3, dest3); + vst1q_f32(dest + dest_stride * 4, dest4); + vst1q_f32(dest + dest_stride * 5, dest5); +} +void output_trans_c4_post(const float* src, + int src_stride, + float* dest, + int dest_stride, + float* bias_value, + bool has_relu = false) { + const float32x4_t src0 = vld1q_f32(src); + const float32x4_t src1 = vld1q_f32(src + src_stride); + const float32x4_t src2 = vld1q_f32(src + src_stride * 2); + const float32x4_t src3 = vld1q_f32(src + src_stride * 3); + const float32x4_t src4 = vld1q_f32(src + src_stride * 4); + const float32x4_t src5 = vld1q_f32(src + src_stride * 5); + const float32x4_t src6 = vld1q_f32(src + src_stride * 6); + const float32x4_t src7 = vld1q_f32(src + src_stride * 7); + + float32x4_t tmp024a = vaddq_f32(src1, src2); + float32x4_t tmp135a = vsubq_f32(src1, src2); + float32x4_t tmp024b = vaddq_f32(src3, src4); + float32x4_t tmp135b = vsubq_f32(src3, src4); + float32x4_t tmp024c = vaddq_f32(src5, src6); + float32x4_t tmp135c = vsubq_f32(src5, src6); + + float32x4_t dest0 = + vaddq_f32(vaddq_f32(vaddq_f32(src0, tmp024a), tmp024b), tmp024c); + float32x4_t dest2 = vaddq_f32(vaddq_f32(tmp024a, vmulq_n_f32(tmp024b, 4)), + vmulq_n_f32(tmp024c, 0.25f)); + float32x4_t dest4 = vaddq_f32(vaddq_f32(tmp024a, vmulq_n_f32(tmp024b, 16)), + vmulq_n_f32(tmp024c, 0.0625f)); + + float32x4_t dest1 = vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 2)), + vmulq_n_f32(tmp135c, 0.5f)); + float32x4_t dest3 = vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 8)), + vmulq_n_f32(tmp135c, 0.125f)); + float32x4_t dest5 = + vaddq_f32(src7, + vaddq_f32(vaddq_f32(tmp135a, vmulq_n_f32(tmp135b, 32)), + vmulq_n_f32(tmp135c, 0.03125f))); + + if (bias_value) { + float32x4_t bias = vld1q_f32(bias_value); + dest0 = vaddq_f32(dest0, bias); + dest1 = vaddq_f32(dest1, bias); + dest2 = vaddq_f32(dest2, bias); + dest3 = vaddq_f32(dest3, bias); + dest4 = vaddq_f32(dest4, bias); + dest5 = vaddq_f32(dest5, bias); + } + + if (has_relu) { + float32x4_t zeros = vdupq_n_f32(0); + dest0 = vmaxq_f32(dest0, zeros); + dest1 = vmaxq_f32(dest1, zeros); + dest2 = vmaxq_f32(dest2, zeros); + dest3 = vmaxq_f32(dest3, zeros); + dest4 = vmaxq_f32(dest4, zeros); + dest5 = vmaxq_f32(dest5, zeros); + } + + vst1q_f32(dest, dest0); + vst1q_f32(dest + dest_stride, dest1); + vst1q_f32(dest + dest_stride * 2, dest2); + vst1q_f32(dest + dest_stride * 3, dest3); + vst1q_f32(dest + dest_stride * 4, dest4); + vst1q_f32(dest + dest_stride * 5, dest5); +} + +void input_trans_c4(const float* src, + int src_stride, + float* dest, + int dest_stride) { + float32x4_t src0 = vld1q_f32(src); + float32x4_t src1 = vld1q_f32(src + src_stride); + float32x4_t src2 = vld1q_f32(src + src_stride * 2); + float32x4_t src3 = vld1q_f32(src + src_stride * 3); + float32x4_t src4 = vld1q_f32(src + src_stride * 4); + float32x4_t src5 = vld1q_f32(src + src_stride * 5); + float32x4_t src6 = vld1q_f32(src + src_stride * 6); + float32x4_t src7 = vld1q_f32(src + src_stride * 7); + + float32x4_t dst0 = vaddq_f32(vsubq_f32(src0, src6), + vmulq_n_f32(vsubq_f32(src4, src2), 5.25)); + float32x4_t dst7 = vaddq_f32(vsubq_f32(src7, src1), + vmulq_n_f32(vsubq_f32(src3, src5), 5.25)); + + float32x4_t tmp12a = + vsubq_f32(vaddq_f32(src2, src6), vmulq_n_f32(src4, 4.25)); + float32x4_t tmp12b = + vsubq_f32(vaddq_f32(src1, src5), vmulq_n_f32(src3, 4.25)); + float32x4_t dst1 = vaddq_f32(tmp12a, tmp12b); + float32x4_t dst2 = vsubq_f32(tmp12a, tmp12b); + + float32x4_t 
tmp34a = vsubq_f32(vaddq_f32(src6, vmulq_n_f32(src2, 0.25)), + vmulq_n_f32(src4, 1.25)); + float32x4_t tmp34b = + vaddq_f32(vsubq_f32(vmulq_n_f32(src1, 0.5), vmulq_n_f32(src3, 2.5)), + vmulq_n_f32(src5, 2)); + float32x4_t dst3 = vaddq_f32(tmp34a, tmp34b); + float32x4_t dst4 = vsubq_f32(tmp34a, tmp34b); + + float32x4_t tmp56a = + vaddq_f32(src6, vmulq_n_f32(vsubq_f32(src2, vmulq_n_f32(src4, 1.25)), 4)); + float32x4_t tmp56b = + vaddq_f32(vsubq_f32(vmulq_n_f32(src1, 2), vmulq_n_f32(src3, 2.5)), + vmulq_n_f32(src5, 0.5)); + float32x4_t dst5 = vaddq_f32(tmp56a, tmp56b); + float32x4_t dst6 = vsubq_f32(tmp56a, tmp56b); + + vst1q_f32(dest, dst0); + vst1q_f32(dest + dest_stride, dst1); + vst1q_f32(dest + dest_stride * 2, dst2); + vst1q_f32(dest + dest_stride * 3, dst3); + vst1q_f32(dest + dest_stride * 4, dst4); + vst1q_f32(dest + dest_stride * 5, dst5); + vst1q_f32(dest + dest_stride * 6, dst6); + vst1q_f32(dest + dest_stride * 7, dst7); +} +void weight_trans_c4( + float* dest, const float* din, int ch_in, int ch_out, void* workspace) { + const float coeff[8][3] = {{1.0f, 0.0f, 0.0f}, + {-2.0f / 9, -2.0f / 9, -2.0f / 9}, + {-2.0f / 9, 2.0f / 9, -2.0f / 9}, + {1.0f / 90, 1.0f / 45, 2.0f / 45}, + {1.0f / 90, -1.0f / 45, 2.0f / 45}, + {32.0f / 45, 16.0f / 45, 8.0f / 45}, + {32.0f / 45, -16.0f / 45, 8.0f / 45}, + {0.0f, 0.0f, 1.0f}}; + + float* ptr_out = static_cast(workspace); + + for (int i = 0; i < ch_out; i++) { + for (int j = 0; j < ch_in; j++) { + const float* kernel0 = + static_cast(din) + (i * ch_in + j) * 9; + float* ptr_channel = ptr_out + (i * ch_in + j) * 64; + + //! transform kernel, transposed + const float* k0 = kernel0; + const float* k1 = kernel0 + 3; + const float* k2 = kernel0 + 6; + + //! h + float tmp[8][3]; + for (int i = 0; i < 8; i++) { + tmp[i][0] = + k0[0] * coeff[i][0] + k0[1] * coeff[i][1] + k0[2] * coeff[i][2]; + tmp[i][1] = + k1[0] * coeff[i][0] + k1[1] * coeff[i][1] + k1[2] * coeff[i][2]; + tmp[i][2] = + k2[0] * coeff[i][0] + k2[1] * coeff[i][1] + k2[2] * coeff[i][2]; + } + + //! 
v + for (int j = 0; j < 8; j++) { + float* tmpp = &tmp[j][0]; + for (int i = 0; i < 8; i++) { + ptr_channel[j * 8 + i] = tmpp[0] * coeff[i][0] + + tmpp[1] * coeff[i][1] + + tmpp[2] * coeff[i][2]; + } + } + } + } + + int oc_pad = (ch_out + 3) / 4 * 4; + int ic_pad = (ch_in + 3) / 4 * 4; + int c_stride = ic_pad * oc_pad; + for (int i = 0; i < ch_out * ch_in * 64; ++i) { + int new_c = i % 64; + int new_oc = i / ch_in / 64 / 4; + int new_ic = i / 64 % (ch_in * 4) % ch_in; + int new_inner = i / ch_in / 64 % 4; + int dest_ind = + new_c * c_stride + new_oc * ic_pad * 4 + new_ic * 4 + new_inner; + dest[dest_ind] = ptr_out[i]; + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s1_direct_fp32.cc b/lite/backends/arm/math/conv3x3s1_direct_fp32.cc index 6a1fa37681..b4972a1eca 100644 --- a/lite/backends/arm/math/conv3x3s1_direct_fp32.cc +++ b/lite/backends/arm/math/conv3x3s1_direct_fp32.cc @@ -35,9 +35,10 @@ size_t conv3x3s1_direct_workspace_size(const operators::ConvParam& param, auto dim_in = param.x->dims(); auto dim_out = param.output->dims(); const int threads = ctx->threads(); + auto paddings = *param.paddings; int llc_size = ctx->llc_size() / sizeof(float); - const int pad_w = param.paddings[1]; - const int pad_h = param.paddings[0]; + const int pad_w = paddings[2]; + const int pad_h = paddings[0]; int ow = dim_out[3]; int oh = dim_out[2]; int ic = dim_in[1]; @@ -74,9 +75,10 @@ void conv_3x3s1_direct_fp32(const float* i_data, ARMContext* ctx) { const int threads = ctx->threads(); int l2_size = ctx->llc_size() / sizeof(float); + auto paddings = *param.paddings; - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; const int wout_round = ROUNDUP(ow, OUT_W_BLOCK); const int win_round = wout_round + 2; bool flag_relu = param.fuse_relu; diff --git a/lite/backends/arm/math/conv3x3s1_direct_int8.cc b/lite/backends/arm/math/conv3x3s1_direct_int8.cc index f966313e11..64e72bc441 100644 --- a/lite/backends/arm/math/conv3x3s1_direct_int8.cc +++ b/lite/backends/arm/math/conv3x3s1_direct_int8.cc @@ -41,10 +41,11 @@ void conv_3x3s1_direct_int8(const int8_t* din, const operators::ConvParam& param, Context* ctx, const float* scale) { + auto paddings = *param.paddings; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; const int threads = ctx->threads(); int llc_size = ctx->llc_size() / 4; diff --git a/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc new file mode 100644 index 0000000000..e4c9fb99ef --- /dev/null +++ b/lite/backends/arm/math/conv3x3s1p01_depthwise_fp32.cc @@ -0,0 +1,2539 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
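+// This file implements the 3x3, stride-1 depthwise convolution kernels for
+// padding 0 and padding 1. conv_depthwise_3x3s1_fp32 (below) dispatches to the
+// main implementations conv_depthwise_3x3s1p0_bias / conv_depthwise_3x3s1p1_bias
+// for wide inputs, and falls back to the small-width variants (*_bias_s) when
+// w_in <= 5 (pad 0) or w_in <= 4 (pad 1).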
+ +#include +#include "lite/backends/arm/math/conv_depthwise.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void conv_depthwise_3x3s1p0_bias(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx); + +void conv_depthwise_3x3s1p0_bias_s(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx); + +void conv_depthwise_3x3s1p1_bias(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx); + +void conv_depthwise_3x3s1p1_bias_s(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx); + +void conv_depthwise_3x3s1_fp32(const float *din, + float *dout, + int num, + int ch_out, + int h_out, + int w_out, + int ch_in, + int h_in, + int w_in, + const float *weights, + const float *bias, + int pad, + bool flag_bias, + bool flag_relu, + ARMContext *ctx) { + if (pad == 0) { + if (w_in > 5) { + conv_depthwise_3x3s1p0_bias(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } else { + conv_depthwise_3x3s1p0_bias_s(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } + if (pad == 1) { + if (w_in > 4) { + conv_depthwise_3x3s1p1_bias(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } else { + conv_depthwise_3x3s1p1_bias_s(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } +} + +#ifdef __aarch64__ +#define INIT_S1 \ + "PRFM PLDL1KEEP, [%[din_ptr0]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr1]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr2]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr3]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr4]] \n" \ + "PRFM PLDL1KEEP, [%[din_ptr5]] \n" \ + "movi v21.4s, #0x0\n" /* out0 = 0 */ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ + +#define LEFT_COMPUTE_S1 \ + "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ /* r0 */ \ + "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * w0[1]*/ \ + \ + "ld1 
{v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * w0[0]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * w0[2]*/ \ + \ + "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * w1[1]*/ \ + "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ \ + "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16=1234 */ \ + "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ \ + \ + /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ \ + 
"ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ + +#define LEFT_RESULT_S1 \ + /* r4 */ \ + "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ /* r5 */ \ + "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ + "cmp %w[cnt], #1 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "blt 3f \n" + +#define MID_COMPUTE_S1 \ + "1: \n" /* r0 */ \ + "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, 
%[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ + +#define MID_RESULT_S1 \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "bne 1b \n" + +#define RIGHT_COMPUTE_S1 \ + "3: \n" \ + "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" \ + "ld1 {v22.4s}, [%[doutr0]] \n" \ + "ld1 {v23.4s}, [%[doutr1]] \n" \ + "ld1 {v24.4s}, [%[doutr2]] \n" \ + "ld1 {v25.4s}, [%[doutr3]] \n" \ + \ + "bif v0.16b, %[vzero].16b, v18.16b \n" \ + "bif v1.16b, %[vzero].16b, v19.16b \n" \ + "bif v2.16b, %[vzero].16b, v18.16b \n" \ + "bif v3.16b, %[vzero].16b, v19.16b \n" \ + \ + "bif v4.16b, %[vzero].16b, v18.16b \n" \ + "bif 
v5.16b, %[vzero].16b, v19.16b \n" \ + "bif v6.16b, %[vzero].16b, v18.16b \n" \ + "bif v7.16b, %[vzero].16b, v19.16b \n" \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ /* r0 */ \ + "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v8.16b, %[vzero].16b, v18.16b \n" \ + "bif v9.16b, %[vzero].16b, v19.16b \n" \ + "bif v10.16b, %[vzero].16b, v18.16b \n" \ + "bif v11.16b, %[vzero].16b, v19.16b \n" \ + \ + "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "ld1 {v18.4s}, [%[rmask]] \n" \ + \ + "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ + "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ + "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ + +#define RIGHT_RESULT_S1 \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v12.16b, v22.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v13.16b, v23.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 
+= din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "bif v14.16b, v24.16b, v18.16b \n" \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "bif v15.16b, v25.16b, v18.16b \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" + +#define LEFT_RESULT_S1_RELU \ + /* r4 */ \ + "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ + \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ + \ + "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ + "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ /* r5*/ \ + "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ + \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ + \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ + "cmp %w[cnt], #1 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + "blt 3f \n" + +#define MID_RESULT_S1_RELU \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + \ + "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += 
din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" \ + \ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + \ + /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ + "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" \ + "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ + \ + "bne 1b \n" + +#define RIGHT_RESULT_S1_RELU \ + /* r3 */ \ + "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v12.16b, v22.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ + "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ + "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "st1 {v12.4s}, [%[doutr0]], #16 \n" \ + "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v13.16b, v23.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ 
\ + "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ + \ + "st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \ + "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ + \ + "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ + \ + "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ + \ + "bif v14.16b, v24.16b, v18.16b \n" \ + \ + "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ + \ + "st1 {v14.4s}, [%[doutr2]], #16 \n" \ + \ + "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ + \ + "bif v15.16b, v25.16b, v18.16b \n" \ + \ + "st1 {v15.4s}, [%[doutr3]], #16 \n" + +#define COMPUTE_S_S1 \ + "prfm pldl1keep, [%[din0]]\n" \ + "prfm pldl1keep, [%[din1]]\n" \ + "prfm pldl1keep, [%[din2]]\n" \ + "prfm pldl1keep, [%[din3]]\n" \ + \ + "ld1 {v0.4s}, [%[din0]], #16\n" \ + "ld1 {v1.4s}, [%[din1]], #16\n" \ + "ld1 {v2.4s}, [%[din2]], #16\n" \ + "ld1 {v3.4s}, [%[din3]], #16\n" \ + \ + "bif v0.16b, %[zero].16b, %[mask].16b\n" \ + "bif v1.16b, %[zero].16b, %[mask].16b\n" \ + "bif v2.16b, %[zero].16b, %[mask].16b\n" \ + "bif v3.16b, %[zero].16b, %[mask].16b\n" \ + \ + "ext v4.16b, %[zero].16b, v0.16b, #12\n" \ + "ext v5.16b, %[zero].16b, v1.16b, #12\n" \ + "ext v6.16b, %[zero].16b, v2.16b, #12\n" \ + "ext v7.16b, %[zero].16b, v3.16b, #12\n" \ + \ + "ext v8.16b, v0.16b, %[zero].16b, #4\n" \ + "ext v9.16b, v1.16b, %[zero].16b, #4\n" \ + "ext v10.16b, v2.16b, %[zero].16b, #4\n" \ + "ext v11.16b, v3.16b, %[zero].16b, #4\n" \ + \ + "fmul v12.4s, v0.4s, %[wr0].s[1]\n" \ + "fmul v13.4s, v1.4s, %[wr0].s[1]\n" \ + \ + "fmul v14.4s, v1.4s, %[wr1].s[1]\n" \ + "fmul v15.4s, v2.4s, %[wr1].s[1]\n" \ + \ + "fmul v16.4s, v2.4s, %[wr2].s[1]\n" \ + "fmul v17.4s, v3.4s, %[wr2].s[1]\n" \ + \ + "fmla v12.4s, v4.4s, %[wr0].s[0]\n" \ + "fmla v13.4s, v5.4s, %[wr0].s[0]\n" \ + \ + "fmla v14.4s, v5.4s, %[wr1].s[0]\n" \ + "fmla v15.4s, v6.4s, %[wr1].s[0]\n" \ + \ + "fmla v16.4s, v6.4s, %[wr2].s[0]\n" \ + "fmla v17.4s, v7.4s, %[wr2].s[0]\n" \ + \ + "fmla v12.4s, v8.4s, %[wr0].s[2]\n" \ + "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ + \ + "fmla v14.4s, v9.4s, %[wr1].s[2]\n" \ + "fmla v15.4s, v10.4s, %[wr1].s[2]\n" \ + \ + "fmla v16.4s, v10.4s, %[wr2].s[2]\n" \ + "fmla v17.4s, v11.4s, %[wr2].s[2]\n" \ + \ + "fadd v12.4s, v12.4s, v14.4s\n" \ + "fadd v12.4s, v12.4s, v16.4s\n" \ + \ + "fadd v13.4s, v13.4s, v15.4s\n" \ + "fadd v13.4s, v13.4s, v17.4s\n" \ + \ + "fadd v12.4s, v12.4s, %[bias].4s\n" \ + "fadd v13.4s, v13.4s, %[bias].4s\n" + +#define RESULT_S_S1 \ + "prfm pldl1keep, [%[out1]]\n" \ + "prfm pldl1keep, [%[out2]]\n" \ + \ + "st1 {v12.4s}, [%[out1]]\n" \ + "st1 {v13.4s}, [%[out2]]\n" + +#define RESULT_S_S1_RELU \ + "prfm pldl1keep, [%[out1]]\n" \ + "prfm pldl1keep, [%[out2]]\n" \ + \ + "fmax v12.4s, v12.4s, %[zero].4s\n" \ + "fmax v13.4s, v13.4s, %[zero].4s\n" \ + \ + "st1 {v12.4s}, [%[out1]]\n" \ + "st1 {v13.4s}, [%[out2]]\n" + +#define COMPUTE_S_S1_P0 \ + "prfm pldl1keep, [%[din0]]\n" \ + "prfm pldl1keep, [%[din1]]\n" \ + "prfm pldl1keep, [%[din2]]\n" \ + "prfm pldl1keep, [%[din3]]\n" \ + \ + "ld1 {v0.4s, v1.4s}, [%[din0]]\n" \ + "ld1 {v2.4s, v3.4s}, [%[din1]]\n" \ + "ld1 {v4.4s, v5.4s}, [%[din2]]\n" \ + "ld1 {v6.4s, v7.4s}, [%[din3]]\n" \ + \ + "bif v0.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v1.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "bif v2.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v3.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "bif v4.16b, %[zero].16b, %[mask1].16b\n" \ + "bif v5.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "bif v6.16b, %[zero].16b, %[mask1].16b\n" \ + "bif 
v7.16b, %[zero].16b, %[mask2].16b\n" \ + \ + "ext v8.16b, v0.16b, v1.16b, #4\n" \ + "ext v9.16b, v0.16b, v1.16b, #8\n" \ + \ + "and v12.16b, %[vbias].16b, %[vbias].16b \n" \ + "and v13.16b, %[vbias].16b, %[vbias].16b \n" /* r0 */ \ + "fmul v10.4s, v0.4s, %[wr0].s[0]\n" \ + "fmul v11.4s, v8.4s, %[wr0].s[1]\n" \ + "fmla v12.4s, v9.4s, %[wr0].s[2]\n" \ + \ + "ext v8.16b, v2.16b, v3.16b, #4\n" \ + "ext v9.16b, v2.16b, v3.16b, #8\n" /* r1 */ \ + "fmul v14.4s, v2.4s, %[wr0].s[0]\n" \ + "fmla v10.4s, v2.4s, %[wr1].s[0]\n" \ + \ + "fmul v15.4s, v8.4s, %[wr0].s[1]\n" \ + "fmla v11.4s, v8.4s, %[wr1].s[1]\n" \ + \ + "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ + "fmla v12.4s, v9.4s, %[wr1].s[2]\n" \ + \ + "ext v8.16b, v4.16b, v5.16b, #4\n" \ + "ext v9.16b, v4.16b, v5.16b, #8\n" /* r2 */ \ + "fmla v14.4s, v4.4s, %[wr1].s[0]\n" \ + "fmla v10.4s, v4.4s, %[wr2].s[0]\n" \ + \ + "fmla v15.4s, v8.4s, %[wr1].s[1]\n" \ + "fmla v11.4s, v8.4s, %[wr2].s[1]\n" \ + \ + "fmla v13.4s, v9.4s, %[wr1].s[2]\n" \ + "fmla v12.4s, v9.4s, %[wr2].s[2]\n" \ + \ + "ext v8.16b, v6.16b, v7.16b, #4\n" \ + "ext v9.16b, v6.16b, v7.16b, #8\n" \ + \ + "fmla v14.4s, v6.4s, %[wr2].s[0]\n" \ + \ + "fmla v15.4s, v8.4s, %[wr2].s[1]\n" \ + \ + "fadd v12.4s, v12.4s, v10.4s\n" \ + \ + "fmla v13.4s, v9.4s, %[wr2].s[2]\n" \ + \ + "fadd v12.4s, v12.4s, v11.4s\n" \ + "fadd v13.4s, v13.4s, v14.4s\n" \ + "fadd v13.4s, v13.4s, v15.4s\n" // \ + // "prfm pldl1keep, [%[out1]]\n" \ + // "prfm pldl1keep, [%[out2]]\n" \ + // \ + // "st1 {v12.4s}, [%[out1]]\n" \ + // "st1 {v13.4s}, [%[out2]]\n" \ + + +#else +#define INIT_S1 \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" \ + "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" \ + "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" \ + "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" \ + \ + "vdup.32 q4, %[bias_val] @ and \n" \ + "vdup.32 q5, %[bias_val] @ and \n" + +#define LEFT_COMPUTE_S1 \ + "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" \ + "vext.32 q7, q8, q9, #1 @ 1234\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" \ + "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" \ + "vext.32 q7, q10, q11, #1 @ 1234\n" \ + \ + /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ + "vld1.32 {d20-d21}, [%[din1_ptr]]! 
@ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ + "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" \ + "vext.32 q7, q12, q13, #1 @ 1234\n" \ + \ + /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" \ + "vext.32 q7, q14, q15, #1 @ 1234\n" + +#define LEFT_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "cmp %[cnt], #1 @ check whether has mid cols\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + "blt 3f @ jump to main loop start point\n" + +#define MID_COMPUTE_S1 \ + "1: @ right pad entry\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + "pld [%[din3_ptr]] @ preload data\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" + +#define MID_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "subs %[cnt], #1 @ loop count minus 1\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "bne 1b @ jump to main loop start point\n" + +#define RIGHT_COMPUTE_S1 \ + "3: @ right pad entry\n" \ + "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ + \ + "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d31}, [%[vmask]]! @ load din r0\n" \ + \ + "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[rmask]]! 
@ load din r0\n" \ + \ + "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" \ + "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" \ + \ + "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" + +#define RIGHT_RESULT_S1 \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ + "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ + "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" + +#define LEFT_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "cmp %[cnt], #1 @ check whether has mid cols\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + "blt 3f @ jump to main loop start point\n" + +#define MID_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" \ + "vdup.32 q4, %[bias_val] @ and \n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add pointer\n" \ + \ + "subs %[cnt], #1 @ loop count minus 1\n" \ + \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "bne 1b @ jump to main loop start point\n" + +#define RIGHT_RESULT_S1_RELU \ + /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ + \ + "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ + "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ + \ + "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ + \ + "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ + "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ + \ + "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" + +#define COMPUTE_S_S1 \ + "pld [%[din0]]\n" \ + "pld [%[din1]]\n" \ + "pld [%[din2]]\n" \ + "pld [%[din3]]\n" \ + \ + "vld1.32 {d12-d13}, [%[din0]]!\n" \ + "vld1.32 {d14-d15}, [%[din1]]!\n" \ + "vld1.32 {d16-d17}, [%[din2]]!\n" \ + "vld1.32 {d18-d19}, [%[din3]]!\n" \ + \ + "vbif q6, %q[vzero], %q[mask]\n" \ + "vbif q7, %q[vzero], %q[mask]\n" \ + "vbif q8, %q[vzero], %q[mask]\n" \ + "vbif q9, %q[vzero], %q[mask]\n" \ + \ + "vmul.f32 q14, q6, %e[wr0][1]\n" \ + "vmul.f32 q15, q7, %e[wr0][1]\n" \ + \ + "vmla.f32 q14, q7, %e[wr1][1]\n" \ + "vmla.f32 q15, q8, %e[wr1][1]\n" \ + \ + "vmla.f32 q14, q8, %e[wr2][1]\n" \ + "vmla.f32 q15, q9, %e[wr2][1]\n" \ + \ + "vext.32 q10, %q[vzero], q6, #3\n" \ + "vext.32 q11, %q[vzero], q7, #3\n" \ + "vext.32 q12, %q[vzero], q8, #3\n" \ + "vext.32 q13, %q[vzero], q9, #3\n" \ + \ + "vmla.f32 q14, q10, %e[wr0][0]\n" \ + "vmla.f32 q15, q11, %e[wr0][0]\n" \ + \ + "vmla.f32 q14, q11, %e[wr1][0]\n" \ + "vmla.f32 q15, q12, %e[wr1][0]\n" \ + \ + "vmla.f32 q14, q12, %e[wr2][0]\n" \ + "vmla.f32 q15, q13, %e[wr2][0]\n" \ + \ + "vext.32 q10, q6, %q[vzero], #1\n" \ + "vext.32 q11, q7, %q[vzero], #1\n" \ + "vext.32 q12, q8, %q[vzero], #1\n" \ + "vext.32 q13, q9, %q[vzero], #1\n" \ + \ + "vmla.f32 q14, q10, %f[wr0][0]\n" \ + "vmla.f32 q15, q11, %f[wr0][0]\n" \ + \ + "vmla.f32 q14, q11, %f[wr1][0]\n" \ + "vmla.f32 q15, q12, %f[wr1][0]\n" \ + \ + "vmla.f32 q14, q12, %f[wr2][0]\n" \ + "vmla.f32 q15, q13, %f[wr2][0]\n" \ + \ + "vadd.f32 q14, q14, %q[bias]\n" \ + "vadd.f32 q15, q15, %q[bias]\n" + +#define RESULT_S_S1 \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vst1.32 {d28-d29}, [%[out1]]\n" \ + "vst1.32 {d30-d31}, [%[out2]]\n" + +#define RESULT_S_S1_RELU \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vmax.f32 q14, q14, %q[vzero]\n" \ + "vmax.f32 q15, q15, %q[vzero]\n" \ + \ + "vst1.32 {d28-d29}, [%[out1]]\n" \ + "vst1.32 {d30-d31}, [%[out2]]\n" + +#define COMPUTE_S_S1_P0 \ + "pld [%[din0]]\n" \ + "pld [%[din1]]\n" \ + "pld [%[din2]]\n" \ + "pld [%[din3]]\n" \ + "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" \ + "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" \ + "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" \ + "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" \ + \ + "vdup.32 q4, %[bias_val] @ and \n" \ + "vdup.32 q5, %[bias_val] @ and \n" \ + \ + "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ + "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ + \ + "vld1.32 {d27}, [%[vmask]]! 
@ load din r0\n" \ + \ + "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ + \ + "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ + \ + "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ + "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vext.32 q6, q8, q9, #1 @ 1234\n" \ + "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ + "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ + "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ + "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ + \ + "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ + \ + "vext.32 q6, q10, q11, #1 @ 1234\n" \ + "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ + "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ + "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ + "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q12, q13, #1 @ 1234\n" \ + "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ + "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ + "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + \ + "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ + "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ + \ + "vext.32 q6, q14, q15, #1 @ 1234\n" \ + "vext.32 q7, q14, q15, #2 @ 2345\n" /* r3 */ \ + "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ + \ + "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ + "vadd.f32 q4, q4, q10 @ q4 += q10 \n" \ + \ + "pld [%[out1]]\n" \ + "pld [%[out2]]\n" \ + \ + "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ + "vadd.f32 q14, q4, q11 @ q4 += q10 \n" \ + \ + "vadd.f32 q5, q5, q8 @ q4 += q10 \n" \ + "vadd.f32 q15, q5, q9 @ q4 += q10 \n" + +#endif +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width > 4 + */ +void conv_depthwise_3x3s1p1_bias(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! pad is done implicit + const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + //! 
for 4x6 convolution window + const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; + + float *zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float *write_ptr = zero_ptr + w_in; + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + int w_stride = 9; + + int tile_w = (w_in + 3) >> 2; + int cnt_col = tile_w - 2; + + unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in); + + uint32x4_t vmask_rp1 = + vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_rp2 = + vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_result = + vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); + + unsigned int vmask[8]; + vst1q_u32(vmask, vmask_rp1); + vst1q_u32(vmask + 4, vmask_rp2); + + unsigned int rmask[4]; + vst1q_u32(rmask, vmask_result); + + float32x4_t vzero = vdupq_n_f32(0.f); + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int c = 0; c < ch_in; c++) { + float *dout_ptr = dout_batch + c * size_out_channel; + + const float *din_ch_ptr = din_batch + c * size_in_channel; + + float bias_val = flag_bias ? bias[c] : 0.f; + float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; + + const float *wei_ptr = weights + c * w_stride; + + float32x4_t wr0 = vld1q_f32(wei_ptr); + float32x4_t wr1 = vld1q_f32(wei_ptr + 3); + float32x4_t wr2 = vld1q_f32(wei_ptr + 6); + + float *doutr0 = dout_ptr; + float *doutr1 = doutr0 + w_out; + float *doutr2 = doutr1 + w_out; + float *doutr3 = doutr2 + w_out; + + const float *dr0 = din_ch_ptr; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + const float *dr4 = dr3 + w_in; + const float *dr5 = dr4 + w_in; + + const float *din_ptr0 = dr0; + const float *din_ptr1 = dr1; + const float *din_ptr2 = dr2; + const float *din_ptr3 = dr3; + const float *din_ptr4 = dr4; + const float *din_ptr5 = dr5; + float *ptr_zero = const_cast(zero); +#ifdef __aarch64__ + for (int i = 0; i < h_in; i += 4) { + //! process top pad pad_h = 1 + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + din_ptr4 = dr4; + din_ptr5 = dr5; + + doutr0 = dout_ptr; + doutr1 = doutr0 + w_out; + doutr2 = doutr1 + w_out; + doutr3 = doutr2 + w_out; + if (i == 0) { + din_ptr0 = zero_ptr; + din_ptr1 = dr0; + din_ptr2 = dr1; + din_ptr3 = dr2; + din_ptr4 = dr3; + din_ptr5 = dr4; + dr0 = dr3; + dr1 = dr4; + dr2 = dr5; + } else { + dr0 = dr4; + dr1 = dr5; + dr2 = dr1 + w_in; + } + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + dr5 = dr4 + w_in; + + //! process bottom pad + if (i + 5 > h_in) { + switch (i + 5 - h_in) { + case 5: + din_ptr1 = zero_ptr; + case 4: + din_ptr2 = zero_ptr; + case 3: + din_ptr3 = zero_ptr; + case 2: + din_ptr4 = zero_ptr; + case 1: + din_ptr5 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 4 > h_out) { + switch (i + 4 - h_out) { + case 3: + doutr1 = write_ptr; + case 2: + doutr2 = write_ptr; + case 1: + doutr3 = write_ptr; + default: + break; + } + } + + int cnt = cnt_col; + if (flag_relu) { + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } else { + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 + MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } + dout_ptr = dout_ptr + 4 * w_out; + } +#else + for (int i = 0; i < h_in; i += 2) { + //! process top pad pad_h = 1 + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + + doutr0 = dout_ptr; + doutr1 = dout_ptr + w_out; + // unsigned int* rst_mask = rmask; + + if (i == 0) { + din_ptr0 = zero_ptr; + din_ptr1 = dr0; + din_ptr2 = dr1; + din_ptr3 = dr2; + dr0 = dr1; + dr1 = dr2; + dr2 = dr3; + dr3 = dr2 + w_in; + } else { + dr0 = dr2; + dr1 = dr3; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + } + //! process bottom pad + if (i + 3 > h_in) { + switch (i + 3 - h_in) { + case 3: + din_ptr1 = zero_ptr; + case 2: + din_ptr2 = zero_ptr; + case 1: + din_ptr3 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 2 > h_out) { + doutr1 = write_ptr; + } + int cnt = cnt_col; + unsigned int *rmask_ptr = rmask; + unsigned int *vmask_ptr = vmask; + if (flag_relu) { + asm volatile( + INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 + MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 + MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + dout_ptr += 2 * w_out; + } //! end of processing mid rows +#endif + } + } +} + +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width <= 4 + */ +void conv_depthwise_3x3s1p1_bias_s(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! 3x3s1 convolution, implemented by direct algorithm + //! pad is done implicit + //! 
for 4x6 convolution window + const int right_pad_idx[4] = {3, 2, 1, 0}; + const float zero[4] = {0.f, 0.f, 0.f, 0.f}; + + float32x4_t vzero = vdupq_n_f32(0.f); + uint32x4_t vmask_rp = + vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + float *dout_channel = dout_batch + i * size_out_channel; + const float *din_channel = din_batch + i * size_in_channel; + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } + + int hs = -1; + int he = 3; + + float out_buf1[4]; + float out_buf2[4]; + float trash_buf[4]; + + int h_cnt = (h_out + 1) >> 1; + float *doutr0 = dout_channel; + float *doutr1 = dout_channel + w_out; + + for (int j = 0; j < h_cnt; ++j) { + const float *dr0 = din_channel + hs * w_in; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + + if (hs == -1) { + dr0 = zero; + } + + switch (he - h_in) { + case 2: + dr2 = zero; + doutr1 = trash_buf; + case 1: + dr3 = zero; + default: + break; + } +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [zero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + } else { + asm volatile(COMPUTE_S_S1 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [zero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17"); + } +#else + if (flag_relu) { + asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S1 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [mask] "w"(vmask_rp), + [bias] "w"(wbias), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *doutr0++ = out_buf1[w]; + *doutr1++ = out_buf2[w]; + } + doutr0 = doutr1; + doutr1 += w_out; + hs += 2; + he += 2; + } // end of processing heights + } // end of processing channels + } // end of processing 
batchs +} + +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width > 4 + */ +void conv_depthwise_3x3s1p0_bias(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! pad is done implicit + const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; + //! for 4x6 convolution window + const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; + + float *zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float *write_ptr = zero_ptr + w_in; + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + int w_stride = 9; + + int tile_w = w_out >> 2; + int remain = w_out % 4; + + unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); + const int remian_idx[4] = {0, 1, 2, 3}; + + uint32x4_t vmask_rp1 = + vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_rp2 = + vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); + uint32x4_t vmask_result = + vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); + + unsigned int vmask[8]; + vst1q_u32(vmask, vmask_rp1); + vst1q_u32(vmask + 4, vmask_rp2); + + unsigned int rmask[4]; + vst1q_u32(rmask, vmask_result); + + float32x4_t vzero = vdupq_n_f32(0.f); + + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int c = 0; c < ch_in; c++) { + float *dout_ptr = dout_batch + c * size_out_channel; + + const float *din_ch_ptr = din_batch + c * size_in_channel; + + float bias_val = flag_bias ? bias[c] : 0.f; + float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; + + const float *wei_ptr = weights + c * w_stride; + + float32x4_t wr0 = vld1q_f32(wei_ptr); + float32x4_t wr1 = vld1q_f32(wei_ptr + 3); + float32x4_t wr2 = vld1q_f32(wei_ptr + 6); + + float *doutr0 = dout_ptr; + float *doutr1 = doutr0 + w_out; + float *doutr2 = doutr1 + w_out; + float *doutr3 = doutr2 + w_out; + + const float *dr0 = din_ch_ptr; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + const float *dr4 = dr3 + w_in; + const float *dr5 = dr4 + w_in; + + const float *din_ptr0 = dr0; + const float *din_ptr1 = dr1; + const float *din_ptr2 = dr2; + const float *din_ptr3 = dr3; + const float *din_ptr4 = dr4; + const float *din_ptr5 = dr5; + + float *ptr_zero = const_cast(zero); +#ifdef __aarch64__ + for (int i = 0; i < h_out; i += 4) { + //! process top pad pad_h = 1 + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + din_ptr4 = dr4; + din_ptr5 = dr5; + + doutr0 = dout_ptr; + doutr1 = doutr0 + w_out; + doutr2 = doutr1 + w_out; + doutr3 = doutr2 + w_out; + + dr0 = dr4; + dr1 = dr5; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + dr5 = dr4 + w_in; + + //! process bottom pad + if (i + 5 >= h_in) { + switch (i + 5 - h_in) { + case 4: + din_ptr1 = zero_ptr; + case 3: + din_ptr2 = zero_ptr; + case 2: + din_ptr3 = zero_ptr; + case 1: + din_ptr4 = zero_ptr; + case 0: + din_ptr5 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 4 > h_out) { + switch (i + 4 - h_out) { + case 3: + doutr1 = write_ptr; + case 2: + doutr2 = write_ptr; + case 1: + doutr3 = write_ptr; + default: + break; + } + } + + int cnt = tile_w; + if (flag_relu) { + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1_RELU + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } else { + asm volatile( + INIT_S1 + "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ + "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ + "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ + "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ + "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ + MID_COMPUTE_S1 MID_RESULT_S1 + "cmp %w[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1 "0: \n" + : [cnt] "+r"(cnt), + [din_ptr0] "+r"(din_ptr0), + [din_ptr1] "+r"(din_ptr1), + [din_ptr2] "+r"(din_ptr2), + [din_ptr3] "+r"(din_ptr3), + [din_ptr4] "+r"(din_ptr4), + [din_ptr5] "+r"(din_ptr5), + [doutr0] "+r"(doutr0), + [doutr1] "+r"(doutr1), + [doutr2] "+r"(doutr2), + [doutr3] "+r"(doutr3) + : [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [bias_val] "r"(vbias), + [vmask] "r"(vmask), + [rmask] "r"(rmask), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21", + "v22", + "v23", + "v24", + "v25"); + } + dout_ptr = dout_ptr + 4 * w_out; + } +#else + for (int i = 0; i < h_out; i += 2) { + din_ptr0 = dr0; + din_ptr1 = dr1; + din_ptr2 = dr2; + din_ptr3 = dr3; + + doutr0 = dout_ptr; + doutr1 = dout_ptr + w_out; + + dr0 = dr2; + dr1 = dr3; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + //! process bottom pad + if (i + 3 >= h_in) { + switch (i + 3 - h_in) { + case 3: + din_ptr1 = zero_ptr; + case 2: + din_ptr2 = zero_ptr; + case 1: + din_ptr3 = zero_ptr; + case 0: + din_ptr3 = zero_ptr; + default: + break; + } + } + //! 
process bottom remain + if (i + 2 > h_out) { + doutr1 = write_ptr; + } + int cnt = tile_w; + unsigned int *rmask_ptr = rmask; + unsigned int *vmask_ptr = vmask; + if (flag_relu) { + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1_RELU + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1_RELU "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S1 + "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" + "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" + "vext.32 q6, q8, q9, #1 @ 0012\n" + "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 + MID_RESULT_S1 + "cmp %[remain], #1 \n" + "blt 0f \n" RIGHT_COMPUTE_S1 + RIGHT_RESULT_S1 "0: \n" + : [dout_ptr1] "+r"(doutr0), + [dout_ptr2] "+r"(doutr1), + [din0_ptr] "+r"(din_ptr0), + [din1_ptr] "+r"(din_ptr1), + [din2_ptr] "+r"(din_ptr2), + [din3_ptr] "+r"(din_ptr3), + [cnt] "+r"(cnt), + [rmask] "+r"(rmask_ptr), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias_val] "r"(bias_val), + [vzero] "w"(vzero), + [remain] "r"(remain) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + dout_ptr += 2 * w_out; + } //! end of processing mid rows +#endif + } + } +} +/** + * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, + * width <= 4 + */ +void conv_depthwise_3x3s1p0_bias_s(float *dout, + const float *din, + const float *weights, + const float *bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext *ctx) { + //! 3x3s1 convolution, implemented by direct algorithm + //! pad is done implicit + //! 
for 4x6 convolution window + const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; + const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; + + float32x4_t vzero = vdupq_n_f32(0.f); + uint32x4_t vmask_rp1 = + vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); + uint32x4_t vmask_rp2 = + vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); + + unsigned int vmask[8]; + vst1q_u32(vmask, vmask_rp1); + vst1q_u32(vmask + 4, vmask_rp2); + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + for (int n = 0; n < num; ++n) { + const float *din_batch = din + n * ch_in * size_in_channel; + float *dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + float *dout_channel = dout_batch + i * size_out_channel; + const float *din_channel = din_batch + i * size_in_channel; + const float *weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + +#ifdef __aarch64__ + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } +#endif // __aarch64__ + + float out_buf1[4]; + float out_buf2[4]; + float trash_buf[4]; + + float *doutr0 = dout_channel; + float *doutr1 = dout_channel + w_out; + + for (int j = 0; j < h_out; j += 2) { + const float *dr0 = din_channel + j * w_in; + const float *dr1 = dr0 + w_in; + const float *dr2 = dr1 + w_in; + const float *dr3 = dr2 + w_in; + + doutr0 = dout_channel + j * w_out; + doutr1 = doutr0 + w_out; + + if (j + 3 >= h_in) { + switch (j + 3 - h_in) { + case 3: + dr1 = zero_ptr; + case 2: + dr2 = zero_ptr; + case 1: + dr3 = zero_ptr; + doutr1 = trash_buf; + case 0: + dr3 = zero_ptr; + doutr1 = trash_buf; + default: + break; + } + } +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [zero] "w"(vzero), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } else { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vbias] "w"(wbias), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [zero] "w"(vzero), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } +#else + unsigned int *vmask_ptr = vmask; + float bias_val = flag_bias ? 
bias[i] : 0.f; + if (flag_relu) { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 + : [din0] "+r"(dr0), + [din1] "+r"(dr1), + [din2] "+r"(dr2), + [din3] "+r"(dr3), + [vmask] "+r"(vmask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [vzero] "w"(vzero), + [bias_val] "r"(bias_val), + [out1] "r"(out_buf1), + [out2] "r"(out_buf2) + : "cc", + "memory", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *doutr0++ = out_buf1[w]; + *doutr1++ = out_buf2[w]; + } + } // end of processing heights + } // end of processing channels + } // end of processing batchs +} +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc new file mode 100644 index 0000000000..08e5efecd7 --- /dev/null +++ b/lite/backends/arm/math/conv3x3s1px_depthwise_fp32.cc @@ -0,0 +1,541 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
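The NEON kernels above (and the channel-blocked variant that follows in conv3x3s1px_depthwise_fp32.cc) all compute a 3x3, stride-1 depthwise convolution with implicit zero padding, a per-channel bias, and an optional fused ReLU. As a plain scalar reference for what each assembly path produces, here is a minimal sketch; the function name and the explicit pad argument are illustrative assumptions, not APIs from this patch:

#include <algorithm>

// Scalar reference for a 3x3, stride-1 depthwise convolution in NCHW layout.
// One 3x3 filter (9 floats) per channel; pad = 1 corresponds to the *p1
// kernels above, pad = 0 to the *p0 kernels.
void conv_depthwise_3x3s1_ref(float* dout, const float* din,
                              const float* weights, const float* bias,
                              bool flag_bias, bool flag_relu, int num,
                              int ch_in, int h_in, int w_in, int h_out,
                              int w_out, int pad) {
  for (int n = 0; n < num; ++n) {
    for (int c = 0; c < ch_in; ++c) {
      const float* din_c = din + (n * ch_in + c) * h_in * w_in;
      const float* wc = weights + c * 9;
      float* dout_c = dout + (n * ch_in + c) * h_out * w_out;
      float bias_val = flag_bias ? bias[c] : 0.f;
      for (int oh = 0; oh < h_out; ++oh) {
        for (int ow = 0; ow < w_out; ++ow) {
          float sum = bias_val;
          for (int kh = 0; kh < 3; ++kh) {
            for (int kw = 0; kw < 3; ++kw) {
              int ih = oh - pad + kh;  // input row, zero outside the image
              int iw = ow - pad + kw;  // input col, zero outside the image
              if (ih >= 0 && ih < h_in && iw >= 0 && iw < w_in) {
                sum += din_c[ih * w_in + iw] * wc[kh * 3 + kw];
              }
            }
          }
          dout_c[oh * w_out + ow] = flag_relu ? std::max(sum, 0.f) : sum;
        }
      }
    }
  }
}

The assembly versions tile exactly this computation: four (armv8) or two (armv7) output rows per pass, vectorized over four output columns, with the bif/vbif bit-select masks standing in for the border checks of the scalar loop.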
+ +#include <arm_neon.h> +#include "lite/backends/arm/math/conv_block_utils.h" +#include "lite/backends/arm/math/conv_impl.h" +#include "lite/core/context.h" +#include "lite/operators/op_params.h" +#ifdef ARM_WITH_OMP +#include <omp.h> +#endif + +namespace paddle { +namespace lite { +namespace arm { +namespace math { +void conv_3x3s1_depthwise_fp32(const float* i_data, + float* o_data, + int bs, + int oc, + int oh, + int ow, + int ic, + int ih, + int win, + const float* weights, + const float* bias, + const operators::ConvParam& param, + ARMContext* ctx) { + int threads = ctx->threads(); + + auto paddings = *param.paddings; + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; + + const int out_c_block = 4; + const int out_h_kernel = 2; + const int out_w_kernel = 4; + const int win_ext = ow + 2; + const int ow_round = ROUNDUP(ow, 4); + const int win_round = ROUNDUP(win_ext, 4); + const int hin_round = oh + 2; + const int prein_size = win_round * hin_round * out_c_block; + auto workspace_size = + threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/; + ctx->ExtendWorkspace(sizeof(float) * workspace_size); + + bool flag_relu = param.fuse_relu; + bool flag_bias = param.bias != nullptr; + + /// get workspace + float* ptr_zero = ctx->workspace_data<float>(); + memset(ptr_zero, 0, sizeof(float) * win_round); + float* ptr_write = ptr_zero + win_round; + + int size_in_channel = win * ih; + int size_out_channel = ow * oh; + + int ws = -pad_w; + int we = ws + win_round; + int hs = -pad_h; + int he = hs + hin_round; + int w_loop = ow_round / 4; + auto remain = w_loop * 4 - ow; + bool flag_remain = remain > 0; + remain = 4 - remain; + remain = remain > 0 ? remain : 0; + int row_len = win_round * out_c_block; + + for (int n = 0; n < bs; ++n) { + const float* din_batch = i_data + n * ic * size_in_channel; + float* dout_batch = o_data + n * oc * size_out_channel; +#pragma omp parallel for num_threads(threads) + for (int c = 0; c < oc; c += out_c_block) { +#ifdef ARM_WITH_OMP + float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size; +#else + float* pre_din = ptr_write + ow_round; +#endif + /// const array size + float pre_out[out_c_block * out_w_kernel * out_h_kernel]; // NOLINT + prepack_input_nxwc4_dw( + din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero); + const float* weight_c = weights + c * 9; // kernel_w * kernel_h + float* dout_c00 = dout_batch + c * size_out_channel; + float bias_local[4] = {0, 0, 0, 0}; + if (flag_bias) { + bias_local[0] = bias[c]; + bias_local[1] = bias[c + 1]; + bias_local[2] = bias[c + 2]; + bias_local[3] = bias[c + 3]; + } + float32x4_t vbias = vld1q_f32(bias_local); +#ifdef __aarch64__ + float32x4_t w0 = vld1q_f32(weight_c); // w0, v23 + float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24 + float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25 + float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26 + float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27 + float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28 + float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29 + float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30 + float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31 +#endif + for (int h = 0; h < oh; h += out_h_kernel) { + float* outc00 = dout_c00 + h * ow; + float* outc01 = outc00 + ow; + float* outc10 = outc00 + size_out_channel; + float* outc11 = outc10 + ow; + float* outc20 = outc10 + size_out_channel; + float* outc21 = outc20 + ow; + float* outc30 = outc20 + size_out_channel; + float* outc31 = outc30 + ow; + const float* inr0 =
pre_din + h * row_len; + const float* inr1 = inr0 + row_len; + const float* inr2 = inr1 + row_len; + const float* inr3 = inr2 + row_len; + if (c + out_c_block > oc) { + switch (c + out_c_block - oc) { + case 3: + outc10 = ptr_write; + outc11 = ptr_write; + case 2: + outc20 = ptr_write; + outc21 = ptr_write; + case 1: + outc30 = ptr_write; + outc31 = ptr_write; + default: + break; + } + } + if (h + out_h_kernel > oh) { + outc01 = ptr_write; + outc11 = ptr_write; + outc21 = ptr_write; + outc31 = ptr_write; + } + float* outl[] = {outc00, + outc10, + outc20, + outc30, + outc01, + outc11, + outc21, + outc31, + reinterpret_cast(bias_local), + reinterpret_cast(flag_relu)}; + void* outl_ptr = reinterpret_cast(outl); + for (int w = 0; w < w_loop; ++w) { + bool flag_mask = (w == w_loop - 1) && flag_remain; + float* out0 = pre_out; +// clang-format off +#ifdef __aarch64__ + asm volatile( + "ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ + "ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/ + "ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ + "ldp q8, q9, [%[inr1]], #32\n" /* load input r1*/ + "ldp q4, q5, [%[inr0]]\n" /* load input r0*/ + "ldp q10, q11, [%[inr1]]\n" /* load input r1*/ + /* r0, r1, mul w0, get out r0, r1 */ + "fmul v15.4s , %[w0].4s, v0.4s\n" /* outr00 = w0 * r0, 0*/ + "fmul v16.4s , %[w0].4s, v1.4s\n" /* outr01 = w0 * r0, 1*/ + "fmul v17.4s , %[w0].4s, v2.4s\n" /* outr02 = w0 * r0, 2*/ + "fmul v18.4s , %[w0].4s, v3.4s\n" /* outr03 = w0 * r0, 3*/ + "fmul v19.4s , %[w0].4s, v6.4s\n" /* outr10 = w0 * r1, 0*/ + "fmul v20.4s , %[w0].4s, v7.4s\n" /* outr11 = w0 * r1, 1*/ + "fmul v21.4s , %[w0].4s, v8.4s\n" /* outr12 = w0 * r1, 2*/ + "fmul v22.4s , %[w0].4s, v9.4s\n" /* outr13 = w0 * r1, 3*/ + /* r0, r1, mul w1, get out r0, r1 */ + "fmla v15.4s , %[w1].4s, v1.4s\n" /* outr00 = w1 * r0[1]*/ + "ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/ + "fmla v16.4s , %[w1].4s, v2.4s\n" /* outr01 = w1 * r0[2]*/ + "fmla v17.4s , %[w1].4s, v3.4s\n" /* outr02 = w1 * r0[3]*/ + "fmla v18.4s , %[w1].4s, v4.4s\n" /* outr03 = w1 * r0[4]*/ + "fmla v19.4s , %[w1].4s, v7.4s\n" /* outr10 = w1 * r1[1]*/ + "fmla v20.4s , %[w1].4s, v8.4s\n" /* outr11 = w1 * r1[2]*/ + "fmla v21.4s , %[w1].4s, v9.4s\n" /* outr12 = w1 * r1[3]*/ + "fmla v22.4s , %[w1].4s, v10.4s\n"/* outr13 = w1 * r1[4]*/ + /* r0, r1, mul w2, get out r0, r1 */ + "fmla v15.4s , %[w2].4s, v2.4s\n" /* outr00 = w2 * r0[2]*/ + "fmla v16.4s , %[w2].4s, v3.4s\n" /* outr01 = w2 * r0[3]*/ + "ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/ + "fmla v17.4s , %[w2].4s, v4.4s\n" /* outr02 = w2 * r0[4]*/ + "fmla v18.4s , %[w2].4s, v5.4s\n" /* outr03 = w2 * r0[5]*/ + "ldp q4, q5, [%[inr2]]\n" /* load input r2*/ + "fmla v19.4s , %[w2].4s, v8.4s\n" /* outr10 = w2 * r1[2]*/ + "fmla v20.4s , %[w2].4s, v9.4s\n" /* outr11 = w2 * r1[3]*/ + "fmla v21.4s , %[w2].4s, v10.4s\n"/* outr12 = w2 * r1[4]*/ + "fmla v22.4s , %[w2].4s, v11.4s\n"/* outr13 = w2 * r1[5]*/ + /* r1, r2, mul w3, get out r0, r1 */ + "fmla v15.4s , %[w3].4s, v6.4s\n" /* outr00 = w3 * r1[0]*/ + "fmla v16.4s , %[w3].4s, v7.4s\n" /* outr01 = w3 * r1[1]*/ + "fmla v17.4s , %[w3].4s, v8.4s\n" /* outr02 = w3 * r1[2]*/ + "fmla v18.4s , %[w3].4s, v9.4s\n" /* outr03 = w3 * r1[3]*/ + "fmla v19.4s , %[w3].4s, v0.4s\n" /* outr10 = w3 * r2[0]*/ + "fmla v20.4s , %[w3].4s, v1.4s\n" /* outr11 = w3 * r2[1]*/ + "fmla v21.4s , %[w3].4s, v2.4s\n" /* outr12 = w3 * r2[2]*/ + "fmla v22.4s , %[w3].4s, v3.4s\n" /* outr13 = w3 * r2[3]*/ + /* r1, r2, mul w4, get out r0, r1 */ + "fmla v15.4s , %[w4].4s, v7.4s\n" /* outr00 = w4 * r1[1]*/ + 
"ldp q6, q7, [%[inr3]], #32\n" /* load input r3*/ + "fmla v16.4s , %[w4].4s, v8.4s\n" /* outr01 = w4 * r1[2]*/ + "fmla v17.4s , %[w4].4s, v9.4s\n" /* outr02 = w4 * r1[3]*/ + "fmla v18.4s , %[w4].4s, v10.4s\n"/* outr03 = w4 * r1[4]*/ + "ldp x0, x1, [%[outl]] \n" + "fmla v19.4s , %[w4].4s, v1.4s\n" /* outr10 = w4 * r2[1]*/ + "fmla v20.4s , %[w4].4s, v2.4s\n" /* outr11 = w4 * r2[2]*/ + "fmla v21.4s , %[w4].4s, v3.4s\n" /* outr12 = w4 * r2[3]*/ + "fmla v22.4s , %[w4].4s, v4.4s\n" /* outr13 = w4 * r2[4]*/ + /* r1, r2, mul w5, get out r0, r1 */ + "fmla v15.4s , %[w5].4s, v8.4s\n" /* outr00 = w5 * r1[2]*/ + "fmla v16.4s , %[w5].4s, v9.4s\n" /* outr01 = w5 * r1[3]*/ + "ldp q8, q9, [%[inr3]], #32\n" /* load input r3*/ + "fmla v17.4s , %[w5].4s, v10.4s\n"/* outr02 = w5 * r1[4]*/ + "fmla v18.4s , %[w5].4s, v11.4s\n"/* outr03 = w5 * r1[5]*/ + "ldp q10, q11, [%[inr3]]\n" /* load input r3*/ + "fmla v19.4s , %[w5].4s, v2.4s\n" /* outr10 = w5 * r2[2]*/ + "fmla v20.4s , %[w5].4s, v3.4s\n" /* outr11 = w5 * r2[3]*/ + "fmla v21.4s , %[w5].4s, v4.4s\n" /* outr12 = w5 * r2[4]*/ + "fmla v22.4s , %[w5].4s, v5.4s\n" /* outr13 = w5 * r2[5]*/ + /* r2, r3, mul w6, get out r0, r1 */ + "fmla v15.4s , %[w6].4s, v0.4s\n" /* outr00 = w6 * r2[0]*/ + "fmla v16.4s , %[w6].4s, v1.4s\n" /* outr01 = w6 * r2[1]*/ + "fmla v17.4s , %[w6].4s, v2.4s\n" /* outr02 = w6 * r2[2]*/ + "fmla v18.4s , %[w6].4s, v3.4s\n" /* outr03 = w6 * r2[3]*/ + "ldp x2, x3, [%[outl], #16] \n" + "fmla v19.4s , %[w6].4s, v6.4s\n" /* outr10 = w6 * r3[0]*/ + "fmla v20.4s , %[w6].4s, v7.4s\n" /* outr11 = w6 * r3[1]*/ + "fmla v21.4s , %[w6].4s, v8.4s\n" /* outr12 = w6 * r3[2]*/ + "fmla v22.4s , %[w6].4s, v9.4s\n" /* outr13 = w6 * r3[3]*/ + /* r2, r3, mul w7, get out r0, r1 */ + "fmla v15.4s , %[w7].4s, v1.4s\n" /* outr00 = w7 * r2[1]*/ + "fmla v16.4s , %[w7].4s, v2.4s\n" /* outr01 = w7 * r2[2]*/ + "fmla v17.4s , %[w7].4s, v3.4s\n" /* outr02 = w7 * r2[3]*/ + "fmla v18.4s , %[w7].4s, v4.4s\n" /* outr03 = w7 * r2[4]*/ + "ldp x4, x5, [%[outl], #32] \n" + "fmla v19.4s , %[w7].4s, v7.4s\n" /* outr10 = w7 * r3[1]*/ + "fmla v20.4s , %[w7].4s, v8.4s\n" /* outr11 = w7 * r3[2]*/ + "fmla v21.4s , %[w7].4s, v9.4s\n" /* outr12 = w7 * r3[3]*/ + "fmla v22.4s , %[w7].4s, v10.4s\n"/* outr13 = w7 * r3[4]*/ + /* r2, r3, mul w8, get out r0, r1 */ + "fmla v15.4s , %[w8].4s, v2.4s\n" /* outr00 = w8 * r2[2]*/ + "fmla v16.4s , %[w8].4s, v3.4s\n" /* outr01 = w8 * r2[3]*/ + "fmla v17.4s , %[w8].4s, v4.4s\n" /* outr02 = w8 * r2[0]*/ + "fmla v18.4s , %[w8].4s, v5.4s\n" /* outr03 = w8 * r2[1]*/ + "ldp x6, x7, [%[outl], #48] \n" + "fmla v19.4s , %[w8].4s, v8.4s\n" /* outr10 = w8 * r3[2]*/ + "fmla v20.4s , %[w8].4s, v9.4s\n" /* outr11 = w8 * r3[3]*/ + "fmla v21.4s , %[w8].4s, v10.4s\n"/* outr12 = w8 * r3[0]*/ + "fmla v22.4s , %[w8].4s, v11.4s\n"/* outr13 = w8 * r3[1]*/ + + "fadd v15.4s, v15.4s, %[vbias].4s\n"/* add bias */ + "fadd v16.4s, v16.4s, %[vbias].4s\n"/* add bias */ + "fadd v17.4s, v17.4s, %[vbias].4s\n"/* add bias */ + "fadd v18.4s, v18.4s, %[vbias].4s\n"/* add bias */ + "fadd v19.4s, v19.4s, %[vbias].4s\n"/* add bias */ + "fadd v20.4s, v20.4s, %[vbias].4s\n"/* add bias */ + "fadd v21.4s, v21.4s, %[vbias].4s\n"/* add bias */ + "fadd v22.4s, v22.4s, %[vbias].4s\n"/* add bias */ + + /* transpose */ + "trn1 v0.4s, v15.4s, v16.4s\n" /* r0: a0a1c0c1*/ + "trn2 v1.4s, v15.4s, v16.4s\n" /* r0: b0b1d0d1*/ + "trn1 v2.4s, v17.4s, v18.4s\n" /* r0: a2a3c2c3*/ + "trn2 v3.4s, v17.4s, v18.4s\n" /* r0: b2b3d2d3*/ + "trn1 v4.4s, v19.4s, v20.4s\n" /* r1: a0a1c0c1*/ + "trn2 v5.4s, v19.4s, v20.4s\n" /* 
r1: b0b1d0d1*/ + "trn1 v6.4s, v21.4s, v22.4s\n" /* r1: a2a3c2c3*/ + "trn2 v7.4s, v21.4s, v22.4s\n" /* r1: b2b3d2d3*/ + "trn1 v15.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ + "trn2 v19.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ + "trn1 v17.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ + "trn2 v21.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/ + "trn1 v16.2d, v4.2d, v6.2d\n" /* r1: a0a1a2a3*/ + "trn2 v20.2d, v4.2d, v6.2d\n" /* r1: c0c1c2c3*/ + "trn1 v18.2d, v5.2d, v7.2d\n" /* r1: b0b1b2b3*/ + "trn2 v22.2d, v5.2d, v7.2d\n" /* r1: d0d1d2d3*/ + + "cbz %w[flag_relu], 0f\n" /* skip relu*/ + "movi v0.4s, #0\n" /* for relu */ + "fmax v15.4s, v15.4s, v0.4s\n" + "fmax v16.4s, v16.4s, v0.4s\n" + "fmax v17.4s, v17.4s, v0.4s\n" + "fmax v18.4s, v18.4s, v0.4s\n" + "fmax v19.4s, v19.4s, v0.4s\n" + "fmax v20.4s, v20.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v0.4s\n" + "fmax v22.4s, v22.4s, v0.4s\n" + "0:\n" + "cbnz %w[flag_mask], 1f\n" + "str q15, [x0]\n" /* save outc00 */ + "str q16, [x4]\n" /* save outc01 */ + "str q17, [x1]\n" /* save outc10 */ + "str q18, [x5]\n" /* save outc11 */ + "str q19, [x2]\n" /* save outc20 */ + "str q20, [x6]\n" /* save outc21 */ + "str q21, [x3]\n" /* save outc30 */ + "str q22, [x7]\n" /* save outc31 */ + "b 2f\n" + "1:\n" + "str q15, [%[out]], #16 \n" /* save remain to pre_out */ + "str q17, [%[out]], #16 \n" /* save remain to pre_out */ + "str q19, [%[out]], #16 \n" /* save remain to pre_out */ + "str q21, [%[out]], #16 \n" /* save remain to pre_out */ + "str q16, [%[out]], #16 \n" /* save remain to pre_out */ + "str q18, [%[out]], #16 \n" /* save remain to pre_out */ + "str q20, [%[out]], #16 \n" /* save remain to pre_out */ + "str q22, [%[out]], #16 \n" /* save remain to pre_out */ + "2:\n" + :[inr0] "+r"(inr0), [inr1] "+r"(inr1), + [inr2] "+r"(inr2), [inr3] "+r"(inr3), + [out]"+r"(out0) + :[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2), + [w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5), + [w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8), + [vbias]"w" (vbias), [outl] "r" (outl_ptr), + [flag_mask] "r" (flag_mask), [flag_relu] "r" (flag_relu) + : "cc", "memory", + "v0","v1","v2","v3","v4","v5","v6","v7", + "v8", "v9", "v10", "v11", "v15", + "v16","v17","v18","v19","v20","v21","v22", + "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7" + ); +#else + asm volatile( + /* load weights */ + "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, w1, to q5, q6\n" + "vld1.32 {d14-d15}, [%[wc0]]! @ load w2, to q7\n" + /* load r0, r1 */ + "vld1.32 {d0-d3}, [%[r0]]! @ load r0, q0, q1\n" + "vld1.32 {d4-d7}, [%[r0]]! @ load r0, q2, q3\n" + /* main loop */ + "0: @ main loop\n" + /* mul r0 with w0, w1, w2, get out r0 */ + "vmul.f32 q8, q5, q0 @ w0 * inr00\n" + "vmul.f32 q9, q5, q1 @ w0 * inr01\n" + "vmul.f32 q10, q5, q2 @ w0 * inr02\n" + "vmul.f32 q11, q5, q3 @ w0 * inr03\n" + "vmla.f32 q8, q6, q1 @ w1 * inr01\n" + "vld1.32 {d0-d3}, [%[r0]] @ load r0, q0, q1\n" + "vmla.f32 q9, q6, q2 @ w1 * inr02\n" + "vmla.f32 q10, q6, q3 @ w1 * inr03\n" + "vmla.f32 q11, q6, q0 @ w1 * inr04\n" + "vmla.f32 q8, q7, q2 @ w2 * inr02\n" + "vmla.f32 q9, q7, q3 @ w2 * inr03\n" + "vld1.32 {d4-d7}, [%[r1]]! @ load r0, q2, q3\n" + "vmla.f32 q10, q7, q0 @ w2 * inr04\n" + "vmla.f32 q11, q7, q1 @ w2 * inr05\n" + "vld1.32 {d0-d3}, [%[r1]]! @ load r0, q0, q1\n" + "vld1.32 {d8-d9}, [%[wc0]]! @ load w3 to q4\n" + /* mul r1 with w0-w5, get out r0, r1 */ + "vmul.f32 q12, q5, q2 @ w0 * inr10\n" + "vmul.f32 q13, q5, q3 @ w0 * inr11\n" + "vmul.f32 q14, q5, q0 @ w0 * inr12\n" + "vmul.f32 q15, q5, q1 @ w0 * inr13\n" + "vld1.32 {d10-d11}, [%[wc0]]! 
@ load w4 to q5\n" + "vmla.f32 q8, q4, q2 @ w3 * inr10\n" + "vmla.f32 q9, q4, q3 @ w3 * inr11\n" + "vmla.f32 q10, q4, q0 @ w3 * inr12\n" + "vmla.f32 q11, q4, q1 @ w3 * inr13\n" + /* mul r1 with w1, w4, get out r1, r0 */ + "vmla.f32 q8, q5, q3 @ w4 * inr11\n" + "vmla.f32 q12, q6, q3 @ w1 * inr11\n" + "vld1.32 {d4-d7}, [%[r1]] @ load r1, q2, q3\n" + "vmla.f32 q9, q5, q0 @ w4 * inr12\n" + "vmla.f32 q13, q6, q0 @ w1 * inr12\n" + "vmla.f32 q10, q5, q1 @ w4 * inr13\n" + "vmla.f32 q14, q6, q1 @ w1 * inr13\n" + "vmla.f32 q11, q5, q2 @ w4 * inr14\n" + "vmla.f32 q15, q6, q2 @ w1 * inr14\n" + "vld1.32 {d12-d13}, [%[wc0]]! @ load w5 to q6\n" + /* mul r1 with w2, w5, get out r1, r0 */ + "vmla.f32 q12, q7, q0 @ w2 * inr12\n" + "vmla.f32 q13, q7, q1 @ w2 * inr13\n" + "vmla.f32 q8, q6, q0 @ w5 * inr12\n" + "vmla.f32 q9, q6, q1 @ w5 * inr13\n" + "vld1.32 {d0-d3}, [%[r2]]! @ load r2, q0, q1\n" + "vmla.f32 q14, q7, q2 @ w2 * inr14\n" + "vmla.f32 q15, q7, q3 @ w2 * inr15\n" + "vmla.f32 q10, q6, q2 @ w5 * inr14\n" + "vmla.f32 q11, q6, q3 @ w5 * inr15\n" + "vld1.32 {d4-d7}, [%[r2]]! @ load r2, q0, q1\n" + "vld1.32 {d14-d15}, [%[wc0]]! @ load w6, to q7\n" + /* mul r2 with w3-w8, get out r0, r1 */ + "vmla.f32 q12, q4, q0 @ w3 * inr20\n" + "vmla.f32 q13, q4, q1 @ w3 * inr21\n" + "vmla.f32 q14, q4, q2 @ w3 * inr22\n" + "vmla.f32 q15, q4, q3 @ w3 * inr23\n" + "vld1.32 {d8-d9}, [%[wc0]]! @ load w7, to q4\n" + "vmla.f32 q8, q7, q0 @ w6 * inr20\n" + "vmla.f32 q9, q7, q1 @ w6 * inr21\n" + "vmla.f32 q10, q7, q2 @ w6 * inr22\n" + "vmla.f32 q11, q7, q3 @ w6 * inr23\n" + /* mul r2 with w4, w7, get out r1, r0 */ + "vmla.f32 q8, q4, q1 @ w7 * inr21\n" + "vmla.f32 q12, q5, q1 @ w4 * inr21\n" + "vld1.32 {d0-d3}, [%[r2]] @ load r2, q0, q1\n" + "vmla.f32 q9, q4, q2 @ w7 * inr22\n" + "vmla.f32 q13, q5, q2 @ w4 * inr22\n" + "vmla.f32 q10, q4, q3 @ w7 * inr23\n" + "vmla.f32 q14, q5, q3 @ w4 * inr23\n" + "vmla.f32 q11, q4, q0 @ w7 * inr24\n" + "vmla.f32 q15, q5, q0 @ w4 * inr24\n" + "vld1.32 {d10-d11}, [%[wc0]]! @ load w8 to q5\n" + /* mul r1 with w5, w8, get out r1, r0 */ + "vmla.f32 q12, q6, q2 @ w5 * inr22\n" + "vmla.f32 q13, q6, q3 @ w5 * inr23\n" + "vmla.f32 q8, q5, q2 @ w8 * inr22\n" + "vmla.f32 q9, q5, q3 @ w8 * inr23\n" + "vld1.32 {d4-d7}, [%[r3]]! @ load r3, q2, q3\n" + "ldr r4, [%[outl], #32] @ load bias addr to r4\n" + "vmla.f32 q14, q6, q0 @ w5 * inr24\n" + "vmla.f32 q15, q6, q1 @ w5 * inr25\n" + "vmla.f32 q10, q5, q0 @ w8 * inr24\n" + "vmla.f32 q11, q5, q1 @ w8 * inr25\n" + "vld1.32 {d0-d3}, [%[r3]]! 
@ load r3, q0, q1\n" + "sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" + /* mul r3 with w6, w7, w8, get out r1 */ + "vmla.f32 q12, q7, q2 @ w6 * inr30\n" + "vmla.f32 q13, q7, q3 @ w6 * inr31\n" + "vmla.f32 q14, q7, q0 @ w6 * inr32\n" + "vmla.f32 q15, q7, q1 @ w6 * inr33\n" + "vmla.f32 q12, q4, q3 @ w7 * inr31\n" + "vld1.32 {d4-d7}, [%[r3]] @ load r3, q2, q3\n" + "vld1.32 {d12-d13}, [r4] @ load bias\n" + "vmla.f32 q13, q4, q0 @ w7 * inr32\n" + "vmla.f32 q14, q4, q1 @ w7 * inr33\n" + "vmla.f32 q15, q4, q2 @ w7 * inr34\n" + "ldr r0, [%[outl]] @ load outc00 to r0\n" + "vmla.f32 q12, q5, q0 @ w8 * inr32\n" + "vmla.f32 q13, q5, q1 @ w8 * inr33\n" + "ldr r5, [%[outl], #36] @ load flag_relu to r5\n" + "vmla.f32 q14, q5, q2 @ w8 * inr34\n" + "vmla.f32 q15, q5, q3 @ w8 * inr35\n" + "ldr r1, [%[outl], #4] @ load outc10 to r1\n" + "vadd.f32 q8, q8, q6 @ r00 add bias\n" + "vadd.f32 q9, q9, q6 @ r01 add bias\n" + "vadd.f32 q10, q10, q6 @ r02 add bias\n" + "vadd.f32 q11, q11, q6 @ r03 add bias\n" + "ldr r2, [%[outl], #8] @ load outc20 to r2\n" + "vadd.f32 q12, q12, q6 @ r10 add bias\n" + "vadd.f32 q13, q13, q6 @ r11 add bias\n" + "vadd.f32 q14, q14, q6 @ r12 add bias\n" + "vadd.f32 q15, q15, q6 @ r13 add bias\n" + "ldr r3, [%[outl], #12] @ load outc30 to r3\n" + "vmov.u32 q7, #0 @ mov zero to q7\n" + "cmp r5, #0 @ cmp flag relu\n" + "beq 1f @ skip relu\n" + "vmax.f32 q8, q8, q7 @ r00 relu\n" + "vmax.f32 q9, q9, q7 @ r01 relu\n" + "vmax.f32 q10, q10, q7 @ r02 relu\n" + "vmax.f32 q11, q11, q7 @ r03 relu\n" + "vmax.f32 q12, q12, q7 @ r10 relu\n" + "vmax.f32 q13, q13, q7 @ r11 relu\n" + "vmax.f32 q14, q14, q7 @ r12 relu\n" + "vmax.f32 q15, q15, q7 @ r13 relu\n" + "1:\n" + "ldr r4, [%[outl], #16] @ load outc01 to r4\n" + "vtrn.32 q8, q9 @ r0: q8 : a0a1c0c1, q9 : b0b1d0d1\n" + "vtrn.32 q10, q11 @ r0: q10: a2a3c2c3, q11: b2b3d2d3\n" + "vtrn.32 q12, q13 @ r1: q12: a0a1c0c1, q13: b0b1d0d1\n" + "vtrn.32 q14, q15 @ r1: q14: a2a3c2c3, q15: b2b3d2d3\n" + "ldr r5, [%[outl], #20] @ load outc11 to r5\n" + "vswp d17, d20 @ r0: q8 : a0a1a2a3, q10: c0c1c2c3 \n" + "vswp d19, d22 @ r0: q9 : b0b1b2b3, q11: d0d1d2d3 \n" + "vswp d25, d28 @ r1: q12: a0a1a2a3, q14: c0c1c2c3 \n" + "vswp d27, d30 @ r1: q13: b0b1b2b3, q15: d0d1d2d3 \n" + "cmp %[flag_mask], #0 @ cmp flag mask\n" + "bne 2f\n" + "vst1.32 {d16-d17}, [r0] @ save outc00\n" + "vst1.32 {d18-d19}, [r1] @ save outc10\n" + "vst1.32 {d20-d21}, [r2] @ save outc20\n" + "vst1.32 {d22-d23}, [r3] @ save outc30\n" + "vst1.32 {d24-d25}, [r4] @ save outc01\n" + "vst1.32 {d26-d27}, [r5] @ save outc11\n" + "ldr r0, [%[outl], #24] @ load outc21 to r0\n" + "ldr r1, [%[outl], #28] @ load outc31 to r1\n" + "vst1.32 {d28-d29}, [r0] @ save outc21\n" + "vst1.32 {d30-d31}, [r1] @ save outc31\n" + "b 3f @ branch end\n" + "2: \n" + "vst1.32 {d16-d17}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d18-d19}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d20-d21}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d22-d23}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d24-d25}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d26-d27}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d28-d29}, [%[out0]]! @ save remain to pre_out\n" + "vst1.32 {d30-d31}, [%[out0]]! 
@ save remain to pre_out\n" + "3: \n" + : [r0] "+r"(inr0), [r1] "+r"(inr1), + [r2] "+r"(inr2), [r3] "+r"(inr3), + [out0] "+r"(out0), [wc0] "+r"(weight_c) + : [flag_mask] "r" (flag_mask), [outl] "r" (outl_ptr) + : "cc", "memory", + "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13","q14", "q15", "r0", "r1", "r2", "r3", "r4", "r5" + ); +#endif // __arch64__ + // clang-format on + outl[0] += 4; + outl[1] += 4; + outl[2] += 4; + outl[3] += 4; + outl[4] += 4; + outl[5] += 4; + outl[6] += 4; + outl[7] += 4; + if (flag_mask) { + memcpy(outl[0] - 4, pre_out, remain * sizeof(float)); + memcpy(outl[1] - 4, pre_out + 4, remain * sizeof(float)); + memcpy(outl[2] - 4, pre_out + 8, remain * sizeof(float)); + memcpy(outl[3] - 4, pre_out + 12, remain * sizeof(float)); + memcpy(outl[4] - 4, pre_out + 16, remain * sizeof(float)); + memcpy(outl[5] - 4, pre_out + 20, remain * sizeof(float)); + memcpy(outl[6] - 4, pre_out + 24, remain * sizeof(float)); + memcpy(outl[7] - 4, pre_out + 28, remain * sizeof(float)); + } + } + } + } + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s2_direct_fp32.cc b/lite/backends/arm/math/conv3x3s2_direct_fp32.cc index 8260718a50..807135f57d 100644 --- a/lite/backends/arm/math/conv3x3s2_direct_fp32.cc +++ b/lite/backends/arm/math/conv3x3s2_direct_fp32.cc @@ -32,10 +32,11 @@ size_t conv3x3s2_direct_workspace_size(const operators::ConvParam& param, ARMContext* ctx) { auto dim_in = param.x->dims(); auto dim_out = param.output->dims(); + auto paddings = *param.paddings; const int threads = ctx->threads(); int llc_size = ctx->llc_size() / sizeof(float); - const int pad_w = param.paddings[1]; - const int pad_h = param.paddings[0]; + const int pad_w = paddings[2]; + const int pad_h = paddings[0]; int ow = dim_out[3]; int oh = dim_out[2]; int ic = dim_in[1]; @@ -73,10 +74,11 @@ void conv_3x3s2_direct_fp32(const float* i_data, //! 3x3s2 convolution, implemented by direct algorithm //! prepack input to tmp buffer //! write output to tmp buffer + auto paddings = *param.paddings; const int threads = ctx->threads(); int l2_size = ctx->llc_size() / sizeof(float); - const int pad_w = param.paddings[1]; - const int pad_h = param.paddings[0]; + const int pad_w = paddings[2]; + const int pad_h = paddings[0]; const int wout_round = ROUNDUP(ow, OUT_W_BLOCK); const int win_round = wout_round * 2 /*stride_w*/ + 1; bool flag_relu = param.fuse_relu; diff --git a/lite/backends/arm/math/conv3x3s2_direct_int8.cc b/lite/backends/arm/math/conv3x3s2_direct_int8.cc index 01b7a812eb..26829544bf 100644 --- a/lite/backends/arm/math/conv3x3s2_direct_int8.cc +++ b/lite/backends/arm/math/conv3x3s2_direct_int8.cc @@ -46,10 +46,11 @@ void conv_3x3s2_direct_int8(const int8_t* din, //! 3x3s2 int8 convolution, implemented by direct algorithm //! prepack input to tmp buffer //! write output to tmp buffer + auto paddings = *param.paddings; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int pad_h = paddings[0]; + int pad_w = paddings[1]; const int threads = ctx->threads(); int llc_size = ctx->llc_size() / 4; @@ -472,10 +473,11 @@ void conv_3x3s2_direct_int8(const int8_t* din, //! 3x3s2 int8 convolution, implemented by direct algorithm //! prepack input to tmp buffer //! 
write output to tmp buffer + auto paddings = *param.paddings; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int pad_h = paddings[0]; + int pad_w = paddings[1]; const int threads = ctx->threads(); //! set 1/4 l2 cache int llc_size = ctx->llc_size() / 4; diff --git a/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc new file mode 100644 index 0000000000..455781e37e --- /dev/null +++ b/lite/backends/arm/math/conv3x3s2p01_depthwise_fp32.cc @@ -0,0 +1,1862 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/backends/arm/math/conv_depthwise.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { +void conv_depthwise_3x3s2p0_bias(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p0_bias_s(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p1_bias(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2p1_bias_s(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx); + +void conv_depthwise_3x3s2_fp32(const float* din, + float* dout, + int num, + int ch_out, + int h_out, + int w_out, + int ch_in, + int h_in, + int w_in, + const float* weights, + const float* bias, + int pad, + bool flag_bias, + bool flag_relu, + ARMContext* ctx) { + if (pad == 0) { + if (w_in > 7) { + conv_depthwise_3x3s2p0_bias(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } else { + conv_depthwise_3x3s2p0_bias_s(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } + if (pad == 1) { + if (w_in > 7) { + conv_depthwise_3x3s2p1_bias(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } else { + conv_depthwise_3x3s2p1_bias_s(dout, + din, + weights, + bias, + flag_bias, + flag_relu, + num, + ch_in, + h_in, + w_in, + h_out, + w_out, + ctx); + } + } +} +#ifdef __aarch64__ +#define INIT_S2 \ + "prfm pldl1keep, [%[inptr0]] \n" \ + "prfm 
pldl1keep, [%[inptr1]] \n" \ + "prfm pldl1keep, [%[inptr2]] \n" \ + "prfm pldl1keep, [%[inptr3]] \n" \ + "prfm pldl1keep, [%[inptr4]] \n" \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" + +#define LEFT_COMPUTE_S2 \ + "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[1] \n" /* {0,2,4,6} * w01 */ \ + "fmul v12.4s, v1.4s, %[w0].s[2] \n" /* {1,3,5,7} * w02 */ \ + "fmla v16.4s, v10.4s, %[w0].s[0] \n" /* {0,1,3,5} * w00*/ \ + \ + "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" /* v10 = {0,1,3,5} */ \ + \ + "sub %[inptr0], %[inptr0], #4 \n" \ + "sub %[inptr1], %[inptr1], #4 \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[1] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[0] \n" \ + \ + "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" \ + \ + "sub %[inptr2], %[inptr2], #4 \n" \ + "sub %[inptr3], %[inptr3], #4 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[1] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[1] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[2] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[2] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[0] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[0] \n" \ + \ + "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" \ + \ + "sub %[inptr4], %[inptr4], #4 \n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[1] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[0] \n" \ + \ + "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" + +#define LEFT_RESULT_S2 \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "cmp %w[cnt], #1 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "blt 1f \n" + +#define MID_COMPUTE_S2 \ + "2: \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ + "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ + \ + "ext v10.16b, v2.16b, v18.16b, #4 \n" \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v4.16b, v19.16b, #4 \n" \ + \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ext v10.16b, v6.16b, v20.16b, #4 \n" \ + \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 
\n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v8.16b, v21.16b, #4 \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" + +#define MID_RESULT_S2 \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "bne 2b \n" + +#define RIGHT_COMPUTE_S2 \ + "1: \n" \ + "cmp %w[remain], #1 \n" \ + "blt 4f \n" \ + "3: \n" \ + "bif v0.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v1.16b, %[vzero].16b, %[mask2].16b \n" \ + \ + "bif v2.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v3.16b, %[vzero].16b, %[mask2].16b \n" \ + \ + "bif v4.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v5.16b, %[vzero].16b, %[mask2].16b \n" \ + \ + "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" \ + \ + "bif v6.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v7.16b, %[vzero].16b, %[mask2].16b \n" /* r0 */ \ + "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ + "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ + \ + "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" \ + "bif v8.16b, %[vzero].16b, %[mask1].16b \n" \ + "bif v9.16b, %[vzero].16b, %[mask2].16b \n" /* r1 */ \ + "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ + "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ + "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" /* r2 */ \ + "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ + "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ + \ + "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ + "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ + \ + "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ + "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" /* r3 */ \ + "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ + "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ + \ + "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" \ + "ld1 {v0.4s}, [%[outptr0]] \n" \ + \ + "fadd v16.4s, v16.4s, v11.4s \n" \ + "fadd v16.4s, v16.4s, v12.4s \n" \ + "ld1 {v1.4s}, [%[outptr1]] \n" + +#define RIGHT_RESULT_S2 \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "bif v16.16b, v0.16b, %[wmask].16b \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "bif v17.16b, v1.16b, %[wmask].16b \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + "4: \n" + +#define LEFT_RESULT_S2_RELU \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ + \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" \ + \ + "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ + "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ + "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "st1 
{v16.4s}, [%[outptr0]], #16 \n" \ + \ + "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" \ + \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "cmp %w[cnt], #1 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "blt 1f \n" + +#define MID_RESULT_S2_RELU \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ + "ld1 {v15.4s}, [%[inptr0]] \n" \ + "ld1 {v18.4s}, [%[inptr1]] \n" \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "ld1 {v19.4s}, [%[inptr2]] \n" \ + "ld1 {v20.4s}, [%[inptr3]] \n" \ + "ld1 {v21.4s}, [%[inptr4]] \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "ext v10.16b, v0.16b, v15.16b, #4 \n" \ + "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ + "subs %w[cnt], %w[cnt], #1 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + \ + "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ + \ + "bne 2b \n" + +#define RIGHT_RESULT_S2_RELU \ + /* r4 */ \ + "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ + "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ + "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ + \ + "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ + \ + "fadd v17.4s, v17.4s, v13.4s \n" \ + \ + "bif v16.16b, v0.16b, %[wmask].16b \n" \ + \ + "fadd v17.4s, v17.4s, v14.4s \n" \ + \ + "st1 {v16.4s}, [%[outptr0]], #16 \n" \ + \ + "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ + \ + "bif v17.16b, v1.16b, %[wmask].16b \n" \ + \ + "st1 {v17.4s}, [%[outptr1]], #16 \n" \ + "4: \n" + +#define COMPUTE_S_S2 \ + "movi v9.4s, #0 \n" \ + "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ + \ + "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" \ + "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" \ + "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" \ + \ + "bif v10.16b, v9.16b, v6.16b \n" \ + "bif v11.16b, v9.16b, v7.16b \n" \ + "bif v12.16b, v9.16b, v6.16b \n" \ + "bif v13.16b, v9.16b, v7.16b \n" \ + "bif v14.16b, v9.16b, v6.16b \n" \ + "bif v15.16b, v9.16b, v7.16b \n" \ + \ + "ext v6.16b, v9.16b, v11.16b, #12 \n" \ + "ext v7.16b, v9.16b, v13.16b, #12 \n" \ + "ext v8.16b, v9.16b, v15.16b, #12 \n" \ + \ + "fmul v4.4s, v10.4s, %[wr0].s[1] \n" \ + "fmul v5.4s, v11.4s, %[wr0].s[2] \n" \ + "fmul v6.4s, v6.4s, %[wr0].s[0] \n" \ + \ + "fmla v4.4s, v12.4s, %[wr1].s[1] \n" \ + "fmla v5.4s, v13.4s, %[wr1].s[2] \n" \ + "fmla v6.4s, v7.4s, %[wr1].s[0] \n" \ + \ + "fmla v4.4s, v14.4s, %[wr2].s[1] \n" \ + "fmla v5.4s, v15.4s, %[wr2].s[2] \n" \ + "fmla v6.4s, v8.4s, %[wr2].s[0] \n" \ + \ + "fadd v4.4s, v4.4s, v5.4s \n" \ + "fadd v4.4s, v4.4s, v6.4s \n" + +#define RESULT_S_S2 \ + "fadd v4.4s, v4.4s, %[bias].4s \n" \ + \ + "st1 {v4.4s}, [%[out]] \n" + +#define RESULT_S_S2_RELU \ + "fadd v4.4s, v4.4s, %[bias].4s \n" \ + "fmax v4.4s, v4.4s, v9.4s \n" \ + \ + "st1 {v4.4s}, [%[out]] \n" + +#define COMPUTE_S_S2_P0 \ + "movi v9.4s, #0 \n" \ + "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ + \ + "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" \ + "ld2 {v12.4s, v13.4s}, 
[%[din1_ptr]], #32 \n" \ + "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" \ + "and v4.16b, %[bias].16b, %[bias].16b \n" \ + \ + "bif v10.16b, v9.16b, v6.16b \n" \ + "bif v11.16b, v9.16b, v7.16b \n" \ + "bif v12.16b, v9.16b, v6.16b \n" \ + "bif v13.16b, v9.16b, v7.16b \n" \ + "bif v14.16b, v9.16b, v6.16b \n" \ + "bif v15.16b, v9.16b, v7.16b \n" \ + \ + "ext v6.16b, v10.16b, v9.16b, #4 \n" \ + "ext v7.16b, v12.16b, v9.16b, #4 \n" \ + "ext v8.16b, v14.16b, v9.16b, #4 \n" \ + \ + "fmla v4.4s, v10.4s, %[wr0].s[0] \n" \ + "fmul v5.4s, v11.4s, %[wr0].s[1] \n" \ + "fmul v16.4s, v6.4s, %[wr0].s[2] \n" \ + \ + "fmla v4.4s, v12.4s, %[wr1].s[0] \n" \ + "fmla v5.4s, v13.4s, %[wr1].s[1] \n" \ + "fmla v16.4s, v7.4s, %[wr1].s[2] \n" \ + \ + "fmla v4.4s, v14.4s, %[wr2].s[0] \n" \ + "fmla v5.4s, v15.4s, %[wr2].s[1] \n" \ + "fmla v16.4s, v8.4s, %[wr2].s[2] \n" \ + \ + "fadd v4.4s, v4.4s, v5.4s \n" \ + "fadd v4.4s, v4.4s, v16.4s \n" + +#define RESULT_S_S2_P0 "st1 {v4.4s}, [%[out]] \n" + +#define RESULT_S_S2_P0_RELU \ + "fmax v4.4s, v4.4s, v9.4s \n" \ + "st1 {v4.4s}, [%[out]] \n" + +#else +#define INIT_S2 \ + "vmov.u32 q9, #0 \n" \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" \ + "pld [%[din0_ptr]] @ preload data\n" \ + "pld [%[din1_ptr]] @ preload data\n" \ + "pld [%[din2_ptr]] @ preload data\n" \ + \ + "vdup.32 q3, %[bias] @ and \n" + +#define LEFT_COMPUTE_S2 \ + "vext.32 q6, q9, q11, #3 @ shift right 1 data\n" \ + "vext.32 q7, q9, q13, #3 @ shift right 1 data\n" \ + "vext.32 q8, q9, q15, #3 @ shift right 1 data\n" \ + "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, out0\n" \ + "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, out0\n" \ + \ + "sub %[din0_ptr], #4 @ inpitr0 - 1\n" \ + "sub %[din1_ptr], #4 @ inpitr1 - 1\n" \ + "sub %[din2_ptr], #4 @ inpitr2 - 1\n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, out0\n" \ + \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, out1\n" \ + "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, out1\n" \ + "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, out1\n" \ + \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define LEFT_RESULT_S2 \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "cmp %[cnt], #1 \n" \ + "blt 1f \n" + +#define MID_COMPUTE_S2 \ + "2: \n" \ + "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + "vext.32 q6, q10, q8, #1 @ shift left 1 \n" \ + "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" \ + \ + "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ + \ + "vext.32 q7, q12, q8, #1 @ shift left 1 \n" \ + "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ + \ + "vext.32 q6, q14, q8, #1 @ shift left 1 \n" \ + \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! 
@ load din r1\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, out0\n" \ + \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define MID_RESULT_S2 \ + "subs %[cnt], #1 \n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "bne 2b \n" + +#define RIGHT_COMPUTE_S2 \ + "1: \n" \ + "cmp %[remain], #1 \n" \ + "blt 3f \n" \ + \ + "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + \ + "vbif q10, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q11, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q12, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q13, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q14, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q15, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + \ + "vext.32 q6, q10, q9, #1 @ shift left 1 \n" \ + "vext.32 q7, q12, q9, #1 @ shift left 1 \n" \ + \ + "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ + \ + "vext.32 q6, q14, q9, #1 @ shift left 1 \n" \ + "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ + \ + "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, out0\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define RIGHT_RESULT_S2 \ + "vbif.f32 q3, q10, q11 @ write mask\n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "3: \n" + +#define LEFT_RESULT_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "cmp %[cnt], #1 \n" \ + "blt 1f \n" + +#define MID_RESULT_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "subs %[cnt], #1 \n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "bne 2b \n" + +#define RIGHT_RESULT_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "vbif.f32 q3, q10, q11 @ write mask\n" \ + \ + "vst1.32 {d6-d7}, [%[outptr]]! \n" \ + "3: \n" + +#define COMPUTE_S_S2 \ + "vmov.u32 q9, #0 \n" \ + "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r2\n" \ + \ + "vbif q10, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q11, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q12, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q13, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q14, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q15, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + \ + "vext.32 q6, q9, q11, #3 @ shift left 1 \n" \ + "vext.32 q7, q9, q13, #3 @ shift left 1 \n" \ + "vext.32 q8, q9, q15, #3 @ shift left 1 \n" \ + \ + "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 0, out0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, out0\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 2, out0\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define RESULT_S_S2 "vst1.32 {d6-d7}, [%[out]] \n" + +#define RESULT_S_S2_RELU \ + "vmax.f32 q3, q3, q9 @ relu\n" \ + \ + "vst1.32 {d6-d7}, [%[out]] \n" + +#define COMPUTE_S_S2_P0 \ + "vmov.u32 q9, #0 \n" \ + "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" \ + "vdup.32 q3, %[bias] @ and \n" \ + \ + "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ + "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ + "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ + \ + "vbif q10, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q11, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q12, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q13, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + "vbif q14, q9, q6 @ bit select, deal with " \ + "right pad\n" \ + "vbif q15, q9, q7 @ bit select, deal with " \ + "right pad\n" \ + \ + "vext.32 q6, q10, q9, #1 @ shift left 1 \n" \ + "vext.32 q7, q12, q9, #1 @ shift left 1 \n" \ + "vext.32 q8, q14, q9, #1 @ shift left 1 \n" \ + \ + "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ + "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ + "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ + \ + "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ + "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ + "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ + \ + "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ + "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ + "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, out0\n" \ + \ + "vadd.f32 q3, q3, q4 @ add \n" \ + "vadd.f32 q3, q3, q5 @ add \n" + +#define RESULT_S_S2_P0 "vst1.32 {d6-d7}, [%[out]] \n" + +#define RESULT_S_S2_P0_RELU \ + "vmax.f32 q3, q3, q9 @ relu \n" \ + "vst1.32 {d6-d7}, [%[out]] \n" + +#endif + +/** + * \brief depthwise convolution kernel 3x3, stride 2 + * w_in > 7 + */ +void conv_depthwise_3x3s2p1_bias(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + int size_pad_bottom = h_out * 2 - h_in; + + int cnt_col = (w_out >> 2) - 2; + int size_right_remain = w_in - (7 + cnt_col * 8); + if 
(size_right_remain >= 9) { + cnt_col++; + size_right_remain -= 8; + } + int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4); // + + int size_right_pad = w_out * 2 - w_in; + + uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + uint32x4_t wmask = + vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + float* zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float* write_ptr = zero_ptr + w_in; + + unsigned int dmask[12]; + + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + vst1q_u32(dmask + 8, wmask); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float32x4_t vzero = vdupq_n_f32(0.f); +#ifdef __aarch64__ + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } +#else + float bias_c = 0.f; + if (flag_bias) { + bias_c = bias[i]; + } +#endif // __aarch64__ + + const float* dr0 = din_channel; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + const float* dr3 = dr2 + w_in; + const float* dr4 = dr3 + w_in; + + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + const float* din3_ptr = dr3; + const float* din4_ptr = dr4; + + float* doutr0 = dout_channel; + float* doutr0_ptr = nullptr; + float* doutr1_ptr = nullptr; + +#ifdef __aarch64__ + for (int i = 0; i < h_in; i += 4) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + din3_ptr = dr3; + din4_ptr = dr4; + + doutr0_ptr = doutr0; + doutr1_ptr = doutr0 + w_out; + + if (i == 0) { + din0_ptr = zero_ptr; + din1_ptr = dr0; + din2_ptr = dr1; + din3_ptr = dr2; + din4_ptr = dr3; + dr0 = dr3; + dr1 = dr4; + } else { + dr0 = dr4; + dr1 = dr0 + w_in; + } + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + + //! process bottom pad + if (i + 4 > h_in) { + switch (i + 4 - h_in) { + case 4: + din1_ptr = zero_ptr; + case 3: + din2_ptr = zero_ptr; + case 2: + din3_ptr = zero_ptr; + case 1: + din4_ptr = zero_ptr; + default: + break; + } + } + //! 
process output pad + if (i / 2 + 2 > h_out) { + doutr1_ptr = write_ptr; + } + int cnt = cnt_col; + if (flag_relu) { + asm volatile( + INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 + MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } else { + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 + MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } + doutr0 = doutr0 + 2 * w_out; + } +#else + for (int i = 0; i < h_in; i += 2) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + + doutr0_ptr = doutr0; + + if (i == 0) { + din0_ptr = zero_ptr; + din1_ptr = dr0; + din2_ptr = dr1; + dr0 = dr1; + dr1 = dr2; + dr2 = dr1 + w_in; + } else { + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + } + + //! 
process bottom pad + if (i + 2 > h_in) { + switch (i + 2 - h_in) { + case 2: + din1_ptr = zero_ptr; + case 1: + din2_ptr = zero_ptr; + default: + break; + } + } + int cnt = cnt_col; + unsigned int* mask_ptr = dmask; + if (flag_relu) { + asm volatile( + INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 + MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 + MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + doutr0 = doutr0 + w_out; + } +#endif + } + } +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 + */ +void conv_depthwise_3x3s2p1_bias_s(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + float zeros[8] = {0.0f}; + + uint32x4_t vmask_rp1 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + unsigned int dmask[8]; + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float bias_c = 0.f; + + if (flag_bias) { + bias_c = bias[i]; + } + float32x4_t vbias = vdupq_n_f32(bias_c); + int hs = -1; + int he = 2; + float out_buf[4]; + for (int j = 0; j < h_out; ++j) { + const float* dr0 = din_channel + hs * w_in; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + if (hs == -1) { + dr0 = zeros; + } + if (he > h_in) { + dr2 = zeros; + } + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + + unsigned int* mask_ptr = dmask; +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "v4", + "v5", 
+ "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } else { + asm volatile(COMPUTE_S_S2 RESULT_S_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); + } +#else + if (flag_relu) { + asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S2 RESULT_S_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *dout_channel++ = out_buf[w]; + } + hs += 2; + he += 2; + } + } + } +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2 + */ +// w_in > 7 +void conv_depthwise_3x3s2p0_bias(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + + int tile_w = w_out >> 2; + int cnt_remain = w_out % 4; + + unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3)); + + uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), + vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + uint32x4_t wmask = + vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + float* zero_ptr = ctx->workspace_data(); + memset(zero_ptr, 0, w_in * sizeof(float)); + float* write_ptr = zero_ptr + w_in; + + unsigned int dmask[12]; + + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + vst1q_u32(dmask + 8, wmask); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float32x4_t vzero = vdupq_n_f32(0.f); + +#ifdef __aarch64__ + float32x4_t wbias; + if (flag_bias) { + wbias = vdupq_n_f32(bias[i]); + } else { + wbias = vdupq_n_f32(0.f); + } +#else + float bias_c = 0.f; + if (flag_bias) { + bias_c = bias[i]; + } +#endif // __aarch64__ + + const float* dr0 = din_channel; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + const float* dr3 = dr2 
+ w_in; + const float* dr4 = dr3 + w_in; + + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + const float* din3_ptr = dr3; + const float* din4_ptr = dr4; + + float* doutr0 = dout_channel; + float* doutr0_ptr = nullptr; + float* doutr1_ptr = nullptr; + +#ifdef __aarch64__ + for (int i = 0; i < h_out; i += 2) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + din3_ptr = dr3; + din4_ptr = dr4; + + doutr0_ptr = doutr0; + doutr1_ptr = doutr0 + w_out; + + dr0 = dr4; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + dr3 = dr2 + w_in; + dr4 = dr3 + w_in; + + //! process bottom pad + if (i * 2 + 5 > h_in) { + switch (i * 2 + 5 - h_in) { + case 4: + din1_ptr = zero_ptr; + case 3: + din2_ptr = zero_ptr; + case 2: + din3_ptr = zero_ptr; + case 1: + din4_ptr = zero_ptr; + case 0: + din4_ptr = zero_ptr; + default: + break; + } + } + //! process output pad + if (i + 2 > h_out) { + doutr1_ptr = write_ptr; + } + int cnt = tile_w; + if (flag_relu) { + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + MID_COMPUTE_S2 MID_RESULT_S2_RELU + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2_RELU + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } else { + asm volatile( + INIT_S2 + "ld1 {v15.4s}, [%[inptr0]] \n" + "ld1 {v18.4s}, [%[inptr1]] \n" + "ld1 {v19.4s}, [%[inptr2]] \n" + "ld1 {v20.4s}, [%[inptr3]] \n" + "ld1 {v21.4s}, [%[inptr4]] \n" + "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} + MID_COMPUTE_S2 MID_RESULT_S2 + "cmp %w[remain], #1 \n" + "blt 4f \n" RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2 + "4: \n" + : [inptr0] "+r"(din0_ptr), + [inptr1] "+r"(din1_ptr), + [inptr2] "+r"(din2_ptr), + [inptr3] "+r"(din3_ptr), + [inptr4] "+r"(din4_ptr), + [outptr0] "+r"(doutr0_ptr), + [outptr1] "+r"(doutr1_ptr), + [cnt] "+r"(cnt) + : [vzero] "w"(vzero), + [w0] "w"(wr0), + [w1] "w"(wr1), + [w2] "w"(wr2), + [remain] "r"(cnt_remain), + [mask1] "w"(vmask_rp1), + [mask2] "w"(vmask_rp2), + [wmask] "w"(wmask), + [vbias] "w"(wbias) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20", + "v21"); + } + doutr0 = doutr0 + 2 * w_out; + } +#else + for (int i = 0; i < h_out; i++) { + din0_ptr = dr0; + din1_ptr = dr1; + din2_ptr = dr2; + + doutr0_ptr = doutr0; + + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + + //! 
process bottom pad + if (i * 2 + 3 > h_in) { + switch (i * 2 + 3 - h_in) { + case 2: + din1_ptr = zero_ptr; + case 1: + din2_ptr = zero_ptr; + default: + break; + } + } + int cnt = tile_w; + unsigned int* mask_ptr = dmask; + if (flag_relu) { + asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU + RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 RIGHT_COMPUTE_S2 + RIGHT_RESULT_S2 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [outptr] "+r"(doutr0_ptr), + [cnt] "+r"(cnt), + [mask_ptr] "+r"(mask_ptr) + : [remain] "r"(cnt_remain), + [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } + doutr0 = doutr0 + w_out; + } +#endif + } + } +} + +/** + * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 + */ +void conv_depthwise_3x3s2p0_bias_s(float* dout, + const float* din, + const float* weights, + const float* bias, + bool flag_bias, + bool flag_relu, + const int num, + const int ch_in, + const int h_in, + const int w_in, + const int h_out, + const int w_out, + ARMContext* ctx) { + int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + int out_pad_idx[4] = {0, 1, 2, 3}; + float zeros[8] = {0.0f}; + const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; + + uint32x4_t vmask_rp1 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 + uint32x4_t vmask_rp2 = + vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 + + int size_in_channel = w_in * h_in; + int size_out_channel = w_out * h_out; + + unsigned int dmask[8]; + vst1q_u32(dmask, vmask_rp1); + vst1q_u32(dmask + 4, vmask_rp2); + + for (int n = 0; n < num; ++n) { + const float* din_batch = din + n * ch_in * size_in_channel; + float* dout_batch = dout + n * ch_in * size_out_channel; +#pragma omp parallel for + for (int i = 0; i < ch_in; ++i) { + const float* din_channel = din_batch + i * size_in_channel; + float* dout_channel = dout_batch + i * size_out_channel; + + const float* weight_ptr = weights + i * 9; + float32x4_t wr0 = vld1q_f32(weight_ptr); + float32x4_t wr1 = vld1q_f32(weight_ptr + 3); + float32x4_t wr2 = vld1q_f32(weight_ptr + 6); + + float bias_c = 0.f; + + if (flag_bias) { + bias_c = bias[i]; + } + float32x4_t vbias = vdupq_n_f32(bias_c); + float out_buf[4]; + const float* dr0 = din_channel; + const float* dr1 = dr0 + w_in; + const float* dr2 = dr1 + w_in; + for (int j = 0; j < h_out; j++) { + const float* din0_ptr = dr0; + const float* din1_ptr = dr1; + const float* din2_ptr = dr2; + if (j * 2 + 2 >= h_in) { + switch (j + 2 - h_in) { + case 1: + din1_ptr = zero_ptr; + case 0: + din2_ptr = zero_ptr; + default: + break; + } + } + dr0 = dr2; + dr1 = dr0 + w_in; + dr2 = dr1 + w_in; + + unsigned int* mask_ptr = dmask; +#ifdef __aarch64__ + if (flag_relu) { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + 
[wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "cc", + "memory", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16"); + } else { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [mask_ptr] "+r"(mask_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "w"(vbias), + [out] "r"(out_buf) + : "cc", + "memory", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16"); + } +#else + if (flag_relu) { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf), + [mask_ptr] "r"(dmask) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } else { + asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr) + : [wr0] "w"(wr0), + [wr1] "w"(wr1), + [wr2] "w"(wr2), + [bias] "r"(bias_c), + [out] "r"(out_buf), + [mask_ptr] "r"(dmask) + : "cc", + "memory", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); + } +#endif + for (int w = 0; w < w_out; ++w) { + *dout_channel++ = out_buf[w]; + } + } + } + } +} +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc new file mode 100644 index 0000000000..9852c0f84e --- /dev/null +++ b/lite/backends/arm/math/conv3x3s2px_depthwise_fp32.cc @@ -0,0 +1,362 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "lite/backends/arm/math/conv_block_utils.h" +#include "lite/backends/arm/math/conv_impl.h" +#include "lite/core/context.h" +#include "lite/operators/op_params.h" +#ifdef ARM_WITH_OMP +#include +#endif + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void conv_3x3s2_depthwise_fp32(const float* i_data, + float* o_data, + int bs, + int oc, + int oh, + int ow, + int ic, + int ih, + int win, + const float* weights, + const float* bias, + const operators::ConvParam& param, + ARMContext* ctx) { + auto paddings = *param.paddings; + int threads = ctx->threads(); + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; + const int out_c_block = 4; + const int out_h_kernel = 1; + const int out_w_kernel = 4; + const int win_ext = ow * 2 + 1; + const int ow_round = ROUNDUP(ow, 4); + const int win_round = ROUNDUP(win_ext, 4); + const int hin_round = oh * 2 + 1; + const int prein_size = win_round * hin_round * out_c_block; + auto workspace_size = + threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/; + ctx->ExtendWorkspace(sizeof(float) * workspace_size); + + bool flag_relu = param.fuse_relu; + bool flag_bias = param.bias != nullptr; + + /// get workspace + auto ptr_zero = ctx->workspace_data(); + memset(ptr_zero, 0, sizeof(float) * win_round); + float* ptr_write = ptr_zero + win_round; + + int size_in_channel = win * ih; + int size_out_channel = ow * oh; + + int ws = -pad_w; + int we = ws + win_round; + int hs = -pad_h; + int he = hs + hin_round; + int w_loop = ow_round / 4; + auto remain = w_loop * 4 - ow; + bool flag_remain = remain > 0; + remain = 4 - remain; + remain = remain > 0 ? remain : 0; + int row_len = win_round * out_c_block; + + for (int n = 0; n < bs; ++n) { + const float* din_batch = i_data + n * ic * size_in_channel; + float* dout_batch = o_data + n * oc * size_out_channel; +#pragma omp parallel for num_threads(threads) + for (int c = 0; c < oc; c += out_c_block) { +#ifdef ARM_WITH_OMP + float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size; +#else + float* pre_din = ptr_write + ow_round; +#endif + /// const array size + prepack_input_nxwc4_dw( + din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero); + const float* weight_c = weights + c * 9; // kernel_w * kernel_h + float* dout_c00 = dout_batch + c * size_out_channel; + float bias_local[4] = {0, 0, 0, 0}; + if (flag_bias) { + bias_local[0] = bias[c]; + bias_local[1] = bias[c + 1]; + bias_local[2] = bias[c + 2]; + bias_local[3] = bias[c + 3]; + } +#ifdef __aarch64__ + float32x4_t w0 = vld1q_f32(weight_c); // w0, v23 + float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24 + float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25 + float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26 + float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27 + float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28 + float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29 + float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30 + float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31 +#endif + for (int h = 0; h < oh; h += out_h_kernel) { + float* outc0 = dout_c00 + h * ow; + float* outc1 = outc0 + size_out_channel; + float* outc2 = outc1 + size_out_channel; + float* outc3 = outc2 + size_out_channel; + const float* inr0 = pre_din + h * 2 * row_len; + const float* inr1 = inr0 + row_len; + const float* inr2 = inr1 + row_len; + if (c + out_c_block > oc) { + switch (c + out_c_block - oc) { + case 3: + outc1 = ptr_write; + case 2: + outc2 = ptr_write; + case 1: + 
outc3 = ptr_write; + default: + break; + } + } + auto c0 = outc0; + auto c1 = outc1; + auto c2 = outc2; + auto c3 = outc3; + float pre_out[16]; + for (int w = 0; w < w_loop; ++w) { + bool flag_mask = (w == w_loop - 1) && flag_remain; + if (flag_mask) { + c0 = outc0; + c1 = outc1; + c2 = outc2; + c3 = outc3; + outc0 = pre_out; + outc1 = pre_out + 4; + outc2 = pre_out + 8; + outc3 = pre_out + 12; + } +// clang-format off +#ifdef __aarch64__ + asm volatile( + "ldr q8, [%[bias]]\n" /* load bias */ + "ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ + "and v19.16b, v8.16b, v8.16b\n" + "ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ + "and v20.16b, v8.16b, v8.16b\n" + "ldp q4, q5, [%[inr0]], #32\n" /* load input r0*/ + "and v21.16b, v8.16b, v8.16b\n" + "ldp q6, q7, [%[inr0]], #32\n" /* load input r0*/ + "and v22.16b, v8.16b, v8.16b\n" + "ldr q8, [%[inr0]]\n" /* load input r0*/ + /* r0 mul w0-w2, get out */ + "fmla v19.4s , %[w0].4s, v0.4s\n" /* outr0 = w0 * r0, 0*/ + "fmla v20.4s , %[w0].4s, v2.4s\n" /* outr1 = w0 * r0, 2*/ + "fmla v21.4s , %[w0].4s, v4.4s\n" /* outr2 = w0 * r0, 4*/ + "fmla v22.4s , %[w0].4s, v6.4s\n" /* outr3 = w0 * r0, 6*/ + "fmla v19.4s , %[w1].4s, v1.4s\n" /* outr0 = w1 * r0, 1*/ + "ldp q0, q1, [%[inr1]], #32\n" /* load input r1*/ + "fmla v20.4s , %[w1].4s, v3.4s\n" /* outr1 = w1 * r0, 3*/ + "fmla v21.4s , %[w1].4s, v5.4s\n" /* outr2 = w1 * r0, 5*/ + "fmla v22.4s , %[w1].4s, v7.4s\n" /* outr3 = w1 * r0, 7*/ + "fmla v19.4s , %[w2].4s, v2.4s\n" /* outr0 = w0 * r0, 2*/ + "ldp q2, q3, [%[inr1]], #32\n" /* load input r1*/ + "fmla v20.4s , %[w2].4s, v4.4s\n" /* outr1 = w0 * r0, 4*/ + "ldp q4, q5, [%[inr1]], #32\n" /* load input r1*/ + "fmla v21.4s , %[w2].4s, v6.4s\n" /* outr2 = w0 * r0, 6*/ + "ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/ + "fmla v22.4s , %[w2].4s, v8.4s\n" /* outr3 = w0 * r0, 8*/ + "ldr q8, [%[inr1]]\n" /* load input r1*/ + /* r1, mul w3-w5, get out */ + "fmla v19.4s , %[w3].4s, v0.4s\n" /* outr0 = w3 * r1, 0*/ + "fmla v20.4s , %[w3].4s, v2.4s\n" /* outr1 = w3 * r1, 2*/ + "fmla v21.4s , %[w3].4s, v4.4s\n" /* outr2 = w3 * r1, 4*/ + "fmla v22.4s , %[w3].4s, v6.4s\n" /* outr3 = w3 * r1, 6*/ + "fmla v19.4s , %[w4].4s, v1.4s\n" /* outr0 = w4 * r1, 1*/ + "ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/ + "fmla v20.4s , %[w4].4s, v3.4s\n" /* outr1 = w4 * r1, 3*/ + "fmla v21.4s , %[w4].4s, v5.4s\n" /* outr2 = w4 * r1, 5*/ + "fmla v22.4s , %[w4].4s, v7.4s\n" /* outr3 = w4 * r1, 7*/ + "fmla v19.4s , %[w5].4s, v2.4s\n" /* outr0 = w5 * r1, 2*/ + "ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/ + "fmla v20.4s , %[w5].4s, v4.4s\n" /* outr1 = w5 * r1, 4*/ + "ldp q4, q5, [%[inr2]], #32\n" /* load input r2*/ + "fmla v21.4s , %[w5].4s, v6.4s\n" /* outr2 = w5 * r1, 6*/ + "ldp q6, q7, [%[inr2]], #32\n" /* load input r2*/ + "fmla v22.4s , %[w5].4s, v8.4s\n" /* outr3 = w5 * r1, 8*/ + "ldr q8, [%[inr2]]\n" /* load input r2*/ + /* r2, mul w6-w8, get out r0, r1 */ + "fmla v19.4s , %[w6].4s, v0.4s\n" /* outr0 = w6 * r2, 0*/ + "fmla v20.4s , %[w6].4s, v2.4s\n" /* outr1 = w6 * r2, 2*/ + "fmla v21.4s , %[w6].4s, v4.4s\n" /* outr2 = w6 * r2, 4*/ + "fmla v22.4s , %[w6].4s, v6.4s\n" /* outr3 = w6 * r2, 6*/ + "fmla v19.4s , %[w7].4s, v1.4s\n" /* outr0 = w7 * r2, 1*/ + "fmla v20.4s , %[w7].4s, v3.4s\n" /* outr1 = w7 * r2, 3*/ + "fmla v21.4s , %[w7].4s, v5.4s\n" /* outr2 = w7 * r2, 5*/ + "fmla v22.4s , %[w7].4s, v7.4s\n" /* outr3 = w7 * r2, 7*/ + "fmla v19.4s , %[w8].4s, v2.4s\n" /* outr0 = w8 * r2, 2*/ + "fmla v20.4s , %[w8].4s, v4.4s\n" /* outr1 = w8 * r2, 4*/ + "fmla v21.4s , %[w8].4s, 
v6.4s\n" /* outr2 = w8 * r2, 6*/ + "fmla v22.4s , %[w8].4s, v8.4s\n" /* outr3 = w8 * r2, 8*/ + /* transpose */ + "trn1 v0.4s, v19.4s, v20.4s\n" /* r0: a0a1c0c1*/ + "trn2 v1.4s, v19.4s, v20.4s\n" /* r0: b0b1d0d1*/ + "trn1 v2.4s, v21.4s, v22.4s\n" /* r0: a2a3c2c3*/ + "trn2 v3.4s, v21.4s, v22.4s\n" /* r0: b2b3d2d3*/ + "trn1 v19.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ + "trn2 v21.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ + "trn1 v20.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ + "trn2 v22.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/ + /* relu */ + "cbz %w[flag_relu], 0f\n" /* skip relu*/ + "movi v0.4s, #0\n" /* for relu */ + "fmax v19.4s, v19.4s, v0.4s\n" + "fmax v20.4s, v20.4s, v0.4s\n" + "fmax v21.4s, v21.4s, v0.4s\n" + "fmax v22.4s, v22.4s, v0.4s\n" + /* save result */ + "0:\n" + "str q19, [%[outc0]], #16\n" + "str q20, [%[outc1]], #16\n" + "str q21, [%[outc2]], #16\n" + "str q22, [%[outc3]], #16\n" + :[inr0] "+r"(inr0), [inr1] "+r"(inr1), + [inr2] "+r"(inr2), + [outc0]"+r"(outc0), [outc1]"+r"(outc1), + [outc2]"+r"(outc2), [outc3]"+r"(outc3) + :[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2), + [w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5), + [w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8), + [bias] "r" (bias_local), [flag_relu]"r"(flag_relu) + : "cc", "memory", + "v0","v1","v2","v3","v4","v5","v6","v7", + "v8", "v19","v20","v21","v22" + ); +#else + asm volatile( + /* fill with bias */ + "vld1.32 {d16-d17}, [%[bias]]\n" /* load bias */ + /* load weights */ + "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w0-2, to q9-11 */ + "vld1.32 {d0-d3}, [%[r0]]!\n" /* load input r0, 0,1*/ + "vand.i32 q12, q8, q8\n" + "vld1.32 {d4-d7}, [%[r0]]!\n" /* load input r0, 2,3*/ + "vand.i32 q13, q8, q8\n" + "vld1.32 {d8-d11}, [%[r0]]!\n" /* load input r0, 4,5*/ + "vand.i32 q14, q8, q8\n" + "vld1.32 {d12-d15}, [%[r0]]!\n" /* load input r0, 6,7*/ + "vand.i32 q15, q8, q8\n" + "vld1.32 {d16-d17}, [%[r0]]\n" /* load input r0, 8*/ + /* mul r0 with w0, w1, w2 */ + "vmla.f32 q12, q9, q0 @ w0 * inr0\n" + "vmla.f32 q13, q9, q2 @ w0 * inr2\n" + "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w2, to q11 */ + "vmla.f32 q14, q9, q4 @ w0 * inr4\n" + "vmla.f32 q15, q9, q6 @ w0 * inr6\n" + "vmla.f32 q12, q10, q1 @ w1 * inr1\n" + "vld1.32 {d0-d3}, [%[r1]]! @ load r1, 0, 1\n" + "vmla.f32 q13, q10, q3 @ w1 * inr3\n" + "vmla.f32 q14, q10, q5 @ w1 * inr5\n" + "vmla.f32 q15, q10, q7 @ w1 * inr7\n" + "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w3-4, to q9-10 */ + "vmla.f32 q12, q11, q2 @ w2 * inr2\n" + "vld1.32 {d4-d7}, [%[r1]]! @ load r1, 2, 3\n" + "vmla.f32 q13, q11, q4 @ w2 * inr4\n" + "vld1.32 {d8-d11}, [%[r1]]! @ load r1, 4, 5\n" + "vmla.f32 q14, q11, q6 @ w2 * inr6\n" + "vld1.32 {d12-d15}, [%[r1]]! @ load r1, 6, 7\n" + "vmla.f32 q15, q11, q8 @ w2 * inr8\n" + /* mul r1 with w3, w4, w5 */ + "vmla.f32 q12, q9, q0 @ w3 * inr0\n" + "vmla.f32 q13, q9, q2 @ w3 * inr2\n" + "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w5, to q11 */ + "vmla.f32 q14, q9, q4 @ w3 * inr4\n" + "vmla.f32 q15, q9, q6 @ w3 * inr6\n" + "vld1.32 {d16-d17}, [%[r1]]\n" /* load input r1, 8*/ + "vmla.f32 q12, q10, q1 @ w4 * inr1\n" + "vld1.32 {d0-d3}, [%[r2]]! @ load r2, 0, 1\n" + "vmla.f32 q13, q10, q3 @ w4 * inr3\n" + "vmla.f32 q14, q10, q5 @ w4 * inr5\n" + "vmla.f32 q15, q10, q7 @ w4 * inr7\n" + "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w6-7, to q9-10 */ + "vmla.f32 q12, q11, q2 @ w5 * inr2\n" + "vld1.32 {d4-d7}, [%[r2]]! @ load r2, 2, 3\n" + "vmla.f32 q13, q11, q4 @ w5 * inr4\n" + "vld1.32 {d8-d11}, [%[r2]]! @ load r2, 4, 5\n" + "vmla.f32 q14, q11, q6 @ w5 * inr6\n" + "vld1.32 {d12-d15}, [%[r2]]! 
@ load r2, 6, 7\n" + "vmla.f32 q15, q11, q8 @ w5 * inr8\n" + /* mul r2 with w6, w7, w8 */ + "vmla.f32 q12, q9, q0 @ w6 * inr0\n" + "vmla.f32 q13, q9, q2 @ w6 * inr2\n" + "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w8, to q11 */ + "vmla.f32 q14, q9, q4 @ w6 * inr4\n" + "vmla.f32 q15, q9, q6 @ w6 * inr6\n" + "vld1.32 {d16-d17}, [%[r2]]\n" /* load input r2, 8*/ + "vmla.f32 q12, q10, q1 @ w7 * inr1\n" + "vmla.f32 q13, q10, q3 @ w7 * inr3\n" + "vmla.f32 q14, q10, q5 @ w7 * inr5\n" + "vmla.f32 q15, q10, q7 @ w7 * inr7\n" + "sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" + "vmla.f32 q12, q11, q2 @ w8 * inr2\n" + "vmla.f32 q13, q11, q4 @ w8 * inr4\n" + "vmla.f32 q14, q11, q6 @ w8 * inr6\n" + "vmla.f32 q15, q11, q8 @ w8 * inr8\n" + /* transpose */ + "vtrn.32 q12, q13\n" /* a0a1c0c1, b0b1d0d1*/ + "vtrn.32 q14, q15\n" /* a2a3c2c3, b2b3d2d3*/ + "vswp d25, d28\n" /* a0a1a2a3, c0c1c2c3*/ + "vswp d27, d30\n" /* b0b1b2b3, d0d1d2d3*/ + "cmp %[flag_relu], #0\n" + "beq 0f\n" /* skip relu*/ + "vmov.u32 q0, #0\n" + "vmax.f32 q12, q12, q0\n" + "vmax.f32 q13, q13, q0\n" + "vmax.f32 q14, q14, q0\n" + "vmax.f32 q15, q15, q0\n" + "0:\n" + "vst1.32 {d24-d25}, [%[outc0]]!\n" /* save outc0*/ + "vst1.32 {d26-d27}, [%[outc1]]!\n" /* save outc1*/ + "vst1.32 {d28-d29}, [%[outc2]]!\n" /* save outc2*/ + "vst1.32 {d30-d31}, [%[outc3]]!\n" /* save outc3*/ + :[r0] "+r"(inr0), [r1] "+r"(inr1), + [r2] "+r"(inr2), [wc0] "+r" (weight_c), + [outc0]"+r"(outc0), [outc1]"+r"(outc1), + [outc2]"+r"(outc2), [outc3]"+r"(outc3) + :[bias] "r" (bias_local), + [flag_relu]"r"(flag_relu) + :"cc", "memory", + "q0","q1","q2","q3","q4","q5","q6","q7", + "q8", "q9","q10","q11","q12","q13","q14","q15" + ); +#endif // __aarch64__ + // clang-format on + if (flag_mask) { + for (int i = 0; i < remain; ++i) { + c0[i] = pre_out[i]; + c1[i] = pre_out[i + 4]; + c2[i] = pre_out[i + 8]; + c3[i] = pre_out[i + 12]; + } + } + } + } + } + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/conv_block_utils.h b/lite/backends/arm/math/conv_block_utils.h index b2d16d18d2..e4279d9a72 100644 --- a/lite/backends/arm/math/conv_block_utils.h +++ b/lite/backends/arm/math/conv_block_utils.h @@ -254,6 +254,7 @@ inline void prepack_input_nxwc4_dw(const float* din, LOG(FATAL) << "prepack_dw_input, valid height must > zero"; } float32x4_t vzero = vdupq_n_f32(0.f); + auto out_data = dout; int size_w = we - ws; int w0 = ws < 0 ? 0 : ws; @@ -269,6 +270,7 @@ inline void prepack_input_nxwc4_dw(const float* din, bool flag_ext_l = left_remain > 0; int left_sl = 4 - left_remain; + int left_valid_sl = left_sl > width ?
width : left_sl; uint32x4_t vmask_padl; bool flag_mask_l = false; if (flag_ext_l) { @@ -290,6 +292,7 @@ inline void prepack_input_nxwc4_dw(const float* din, } int size_c = width * height; for (int h = hs; h < he; ++h) { + dout = out_data + (h - hs) * 4 * size_w; auto ptr_c0 = din + cs * size_c + h * width; auto ptr_c1 = ptr_c0 + size_c; auto ptr_c2 = ptr_c1 + size_c; @@ -351,10 +354,10 @@ inline void prepack_input_nxwc4_dw(const float* din, } transpose_4x4(vc0, vc1, vc2, vc3, dout); dout += 16; - ptr_c0 += left_sl; - ptr_c1 += left_sl; - ptr_c2 += left_sl; - ptr_c3 += left_sl; + ptr_c0 += left_valid_sl; + ptr_c1 += left_valid_sl; + ptr_c2 += left_valid_sl; + ptr_c3 += left_valid_sl; } /// valid for (int i = 0; i < cnt_valid; ++i) { @@ -722,7 +725,57 @@ inline bool write_to_output_c1_fp32(const float* din, } return true; } - +#ifdef __aarch64__ +#define NCHWC2_TRANS_FP32_COMPUTE \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1*/ \ + "movi v20.4s, #0 \n" /* for relu */ \ + "1: \n" /* main loop*/ \ + "trn1 v2.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ \ + "trn2 v3.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1*/ \ + "trn1 v4.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ \ + "trn2 v5.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ + +#define NCHWC2_TRANS_FP32_RELU \ + "fmax v4.4s, v4.4s, v20.4s \n" /*relu*/ \ + "fmax v5.4s, v5.4s, v20.4s \n" /*relu*/ + +#define NCHWC2_TRANS_FP32_STORE \ + "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ \ + \ + "str q4, [%[doutc0r0]], #16 \n" /* store c0r0*/ \ + "str q5, [%[doutc1r0]], #16 \n" /* store c1r0*/ \ + \ + "bne 1b \n" /* jump to main loop*/ +#else +#define NCHWC2_TRANS_FP32_COMPUTE \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0, " \ + "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" \ + "vmov.u32 q15, #0 @ dump zero\n" \ + "1: @ main loop\n" \ + "vtrn.32 d0, d1 @ trans data:c0r0, c0r1, " \ + "c1r0, c1r1 \n" \ + "vtrn.32 d2, d3 @ trans data:c0r2, c0r3, " \ + "c1r2, c1r3 \n" \ + \ + "vswp d1, d2 @ swap data\n" + +#define NCHWC2_TRANS_FP32_RELU \ + "vmax.f32 q0, q0, q15 @ relu\n" \ + "vmax.f32 q1, q1, q15 @ relu\n" + +#define NCHWC2_TRANS_FP32_STORE \ + "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add " \ + "pointer\n" \ + \ + "subs %[cnt], %[cnt], #1 @ loop count - 1\n" \ + \ + "vld1.32 {d0-d3}, [%[ptr_din]]!
@ load data \n" \ + \ + "bne 1b @ jump to main loop\n" +#endif /*wirte result in outputs * input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w] */ @@ -777,127 +830,41 @@ inline bool write_to_output_c2_fp32(const float* din, int cnt_loop = cnt; if (flag_relu) { #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v2.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v3.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "trn1 v4.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - "trn2 v5.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - - "fmax v2.4s, v4.4s, v20.4s \n" /*relu*/ - "fmax v3.4s, v5.4s, v20.4s \n" /*relu*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - - "str q2, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q3, [%[doutc1r0]], #16 \n" /* store c2r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5", "v20"); + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_RELU + NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_hei_ptr) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v20"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0, " - "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 d0, d1 @ trans data:c0r0, c0r1, " - "c1r0, c1r1 \n" - "vtrn.32 d2, d3 @ trans data:c0r2, c0r3, " - "c1r2, c1r3 \n" - - "vswp d1, d2 @ swap data\n" - - "vmax.f32 q0, q0, q15 @ relu\n" - "vmax.f32 q1, q1, q15 @ relu\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! 
@ load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_RELU + NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q15"); #endif } else { #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "1: \n" /* main loop*/ - "trn1 v2.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v3.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load data, c0r0, c1r0, c0r1, - c1r1, , c0r2, c1r2, c0r3, - c1r3 */ - "trn1 v4.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - "trn2 v5.2d, v2.2d, v3.2d \n" /* trans q8, q10*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - - "str q4, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q5, [%[doutc1r0]], #16 \n" /* store c2r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", "v1", "v2", "v3", "v4", "v5"); + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_hei_ptr) + : + : "v0", "v1", "v2", "v3", "v4", "v5"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @ load data, c0r0, " - "c1r0, c0r1, c1r1, , c0r2, c1r2, c0r3, c1r3\n" - "1: @ main loop\n" - "vtrn.32 d0, d1 @ trans data:c0r0, c0r1, " - "c1r0, c1r1 \n" - "vtrn.32 d2, d3 @ trans data:c0r2, c0r3, " - "c1r2, c1r3 \n" - - "vswp d1, d2 @ swap data\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! 
@ load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); + asm volatile(NCHWC2_TRANS_FP32_COMPUTE NCHWC2_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q15"); #endif } } @@ -922,6 +889,70 @@ inline bool write_to_output_c2_fp32(const float* din, return true; } +#ifdef __aarch64__ +#define NCHWC4_TRANS_FP32_COMPUTE \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \ + "movi v20.4s, #0 \n" /* for relu */ \ + "1: \n" /* main loop*/ \ + "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ \ + "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ \ + "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ \ + "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \ + "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ \ + "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ \ + "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ \ + "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ + +#define NCHWC4_TRANS_FP32_RELU \ + "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ \ + "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ \ + "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ \ + "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ + +#define NCHWC4_TRANS_FP32_STORE \ + "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ \ + "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ \ + "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ \ + "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ \ + "bne 1b \n" /* jump to main loop*/ +#else +#define NCHWC4_TRANS_FP32_COMPUTE \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" \ + "vmov.u32 q15, #0 @ dump zero\n" \ + "1: @ main loop\n" \ + "vtrn.32 q0, q1 @ trans data:c00c01c20c21 " \ + "\n" \ + "vtrn.32 q2, q3 @ trans data:c02c03c22c23 " \ + "\n" \ + \ + "vswp d1, d4 @ swap data\n" \ + "vswp d3, d6 @ swap data\n" + +#define NCHWC4_TRANS_FP32_RELU \ + "vmax.f32 q0, q0, q15 @ relu\n" \ + "vmax.f32 q1, q1, q15 @ relu\n" \ + "vmax.f32 q2, q2, q15 @ relu\n" \ + "vmax.f32 q3, q3, q15 @ relu\n" + +#define NCHWC4_TRANS_FP32_STORE \ + "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" \ + "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" \ + "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" \ + "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" \ + \ + "subs %[cnt], %[cnt], #1 @ loop count - 1\n" \ + \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" \ + \ + "bne 1b @ jump to main loop\n" +#endif /*wirte result in outputs * input din: [n, c / 4, h, w * 4], output dout: [n, c, h, w] */ @@ -958,7 +989,9 @@ inline bool write_to_output_c4_fp32(const float* din, int size_h = (he > height ? height : he) - hs; // size_h == hei_n - int cnt = (width - ws) / w4; + int valid_we = we > width ? 
width : we; + int cnt = (valid_we - ws) / w4; + int remain = valid_we - ws - cnt * w4; for (int i = 0; i < size_h; i++) { int size_w = i * width; @@ -983,185 +1016,88 @@ inline bool write_to_output_c4_fp32(const float* din, int cnt_loop = cnt; if (flag_relu) { #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ - "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ - "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ - "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v16", - "v17", - "v18", - "v19", - "v20"); + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_RELU + NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_hei_ptr) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v16", + "v17", + "v18", + "v19", + "v20"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q0, q1 @ trans data:c00c01c20c21 " - "\n" - "vtrn.32 q2, q3 @ trans data:c02c03c22c23 " - "\n" - - "vswp d1, d4 @ swap data\n" - "vswp d3, d6 @ swap data\n" - - "vmax.f32 q0, q0, q15 @ relu\n" - "vmax.f32 q1, q1, q15 @ relu\n" - "vmax.f32 q2, q2, q15 @ relu\n" - "vmax.f32 q3, q3, q15 @ relu\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q15"); + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_RELU + NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q15"); #endif } else { #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v1.4s \n" /* trans q0, q1*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "trn1 v10.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v2.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "trn1 v16.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn2 v17.2d, v8.2d, v10.2d \n" /* trans q8, q10*/ - "trn1 v18.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "trn2 v19.2d, v9.2d, v11.2d \n" /* trans q9, q11*/ - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v8", - "v9", - "v10", - "v11", - "v16", - "v17", - "v18", - "v19"); + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_hei_ptr) + : + : "v0", + "v1", + "v2", + "v3", + "v8", + "v9", + "v10", + "v11", + "v16", + "v17", + "v18", + "v19"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "1: @ main loop\n" - "vtrn.32 q0, q1 @ trans data:c00c01c20c21 " - "\n" - "vtrn.32 q2, q3 @ trans data:c02c03c22c23 " - "\n" - - "vswp d1, d4 @ swap data\n" - "vswp d3, d6 @ swap data\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add pointer\n" - "vst1.32 {d2-d3}, [%[doutc1r0]]! @ store result, add pointer\n" - "vst1.32 {d4-d5}, [%[doutc2r0]]! @ store result, add pointer\n" - "vst1.32 {d6-d7}, [%[doutc3r0]]! @ store result, add pointer\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3"); + asm volatile(NCHWC4_TRANS_FP32_COMPUTE NCHWC4_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3"); #endif } } - if (we > width) { + if (remain > 0) { int offset = i * w_round * c4 + c4 * w4 * cnt; din_hei_ptr = ptr_din + offset; - int j = we - w4; + int j = 0; if (flag_relu) { - for (; j < width; ++j) { + for (; j < remain; ++j) { *(doutc0_ptr++) = LITEMAX(din_hei_ptr[0], 0.f); *(doutc1_ptr++) = LITEMAX(din_hei_ptr[1], 0.f); *(doutc2_ptr++) = LITEMAX(din_hei_ptr[2], 0.f); @@ -1169,7 +1105,7 @@ inline bool write_to_output_c4_fp32(const float* din, din_hei_ptr += w4; } } else { - for (; j < width; ++j) { + for (; j < remain; ++j) { *(doutc0_ptr++) = din_hei_ptr[0]; *(doutc1_ptr++) = din_hei_ptr[1]; *(doutc2_ptr++) = din_hei_ptr[2]; @@ -1182,6 +1118,120 @@ inline bool write_to_output_c4_fp32(const float* din, return true; } +#ifdef __aarch64__ +#define NCHWC8_TRANS_FP32_COMPUTE \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \ + "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \ + "movi v20.4s, #0 \n" /* for relu */ \ + "1: \n" /* main loop*/ \ + "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ \ + "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ \ + "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ \ + "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ \ + "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + \ + "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ \ + "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ \ + "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ \ + "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ \ + "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ \ + \ + "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ \ + "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ \ + "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ \ + "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ \ + "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ \ + \ + "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ \ + "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ \ + "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ \ + "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ \ + "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ + +#define NCHWC8_TRANS_FP32_RELU \ + "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ \ + "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ \ + "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ \ + "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ \ + \ + "fmax v8.4s, v8.4s, v20.4s \n" /*relu*/ \ + "fmax v9.4s, v9.4s, v20.4s \n" /*relu*/ \ + "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ \ + "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ + +#define NCHWC8_TRANS_FP32_STORE \ + "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ \ + "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ \ + "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ \ + "str q19, [%[doutc3r0]], #16 
\n" /* store c3r0*/ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ \ + "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ \ + "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ \ + "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ \ + "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ \ + \ + "bne 1b \n" /* jump to main loop*/ +#else +#define NCHWC8_TRANS_FP32_COMPUTE \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" \ + "vmov.u32 q15, #0 @ dump zero\n" \ + "1: @ main loop\n" \ + "vtrn.32 q0, q2 @ trans q0, q2 \n" \ + "vtrn.32 q4, q6 @ trans q4, q6 \n" \ + "vswp.32 d1, d8 @ swap d1, d8 \n" \ + "vswp.32 d5, d12 @ swap d5, d12\n" \ + \ + "vtrn.32 q1, q3 @ trans q1, q3 \n" \ + "vtrn.32 q5, q7 @ trans q5, q7 \n" \ + "vswp.32 d3, d10 @ swap d3, d10\n" \ + "vswp.32 d7, d14 @ swap d7, d14\n" + +#define NCHWC8_TRANS_FP32_RELU \ + "vmax.f32 q0, q0, q15 @ relu\n" \ + "vmax.f32 q1, q1, q15 @ relu\n" \ + "vmax.f32 q2, q2, q15 @ relu\n" \ + "vmax.f32 q3, q3, q15 @ relu\n" \ + \ + "vmax.f32 q4, q4, q15 @ relu\n" \ + "vmax.f32 q5, q5, q15 @ relu\n" \ + "vmax.f32 q6, q6, q15 @ relu\n" \ + "vmax.f32 q7, q7, q15 @ relu\n" + +#define NCHWC8_TRANS_FP32_STORE \ + "subs %[cnt], %[cnt], #1 @ loop count - 1\n" \ + "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add " \ + "pointer\n" \ + \ + "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" \ + \ + "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " \ + "pointer\n" \ + "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " \ + "pointer\n" \ + \ + "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" \ + "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" \ + \ + "bne 1b @ jump to main loop\n" + +#endif /*wirte result in outputs * input din: [n, c / 8, h, w * 8], output dout: [n, c, h, w] */ @@ -1261,158 +1311,54 @@ inline bool write_to_output_c8_fp32(const float* din, if (cnt > 0) { int cnt_loop = cnt; #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "movi v20.4s, #0 \n" /* for relu */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "fmax v16.4s, v16.4s, v20.4s \n" /*relu*/ - "fmax v17.4s, v17.4s, v20.4s \n" /*relu*/ - "fmax v18.4s, v18.4s, v20.4s \n" /*relu*/ - "fmax v19.4s, v19.4s, v20.4s \n" /*relu*/ - - "fmax v8.4s, v8.4s, v20.4s \n" /*relu*/ - "fmax v9.4s, v9.4s, v20.4s \n" /*relu*/ - "fmax v12.4s, v12.4s, v20.4s \n" /*relu*/ - "fmax v13.4s, v13.4s, v20.4s \n" /*relu*/ - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); + asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_RELU + NCHWC8_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [doutc4r0] "+r"(doutc4_ptr), + [doutc5r0] "+r"(doutc5_ptr), + [doutc6r0] "+r"(doutc6_ptr), + [doutc7r0] "+r"(doutc7_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] 
"+r"(din_hei_ptr) + : + : "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "vmov.u32 q15, #0 @ dump zero\n" - "1: @ main loop\n" - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "vmax.f32 q0, q0, q15 @ relu\n" - "vmax.f32 q1, q1, q15 @ relu\n" - "vmax.f32 q2, q2, q15 @ relu\n" - "vmax.f32 q3, q3, q15 @ relu\n" - - "vmax.f32 q4, q4, q15 @ relu\n" - "vmax.f32 q5, q5, q15 @ relu\n" - "vmax.f32 q6, q6, q15 @ relu\n" - "vmax.f32 q7, q7, q15 @ relu\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4", "q15"); + asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_RELU + NCHWC8_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [doutc4r0] "+r"(doutc4_ptr), + [doutc5r0] "+r"(doutc5_ptr), + [doutc6r0] "+r"(doutc6_ptr), + [doutc7r0] "+r"(doutc7_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q4", "q15"); #endif } if (we > width) { @@ -1468,138 +1414,53 @@ inline bool write_to_output_c8_fp32(const float* din, if (cnt > 0) { int cnt_loop = cnt; #ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - "1: \n" /* main loop*/ - "trn1 v8.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn2 v9.4s, v0.4s, v2.4s \n" /* trans q0, q1*/ - "trn1 v10.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "trn2 v11.4s, v1.4s, v3.4s \n" /* trans q2, q3*/ - "ldp q0, q1, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v12.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn2 v13.4s, v4.4s, v6.4s \n" /* trans q0, q1*/ - "trn1 v14.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "trn2 v15.4s, v5.4s, v7.4s \n" /* trans q2, q3*/ - "ldp q2, q3, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "trn1 v16.2d, v8.2d, v12.2d \n" /* trans q8, q10 00 01 02 03*/ - "trn2 v17.2d, v8.2d, v12.2d \n" /* trans q8, q10 20 21 22 23*/ - "trn1 v18.2d, v9.2d, v13.2d \n" /* trans q9, q11 10 11 12 13*/ - "trn2 v19.2d, v9.2d, v13.2d \n" /* trans q9, q11 30 31 32 33*/ - "ldp q4, q5, [%[ptr_din]], #32 \n" /* load r00, r01 to q0, q1 */ - - "trn1 v8.2d, v10.2d, v14.2d \n" /* trans q8, q10 40 41 42 43*/ - "trn2 v9.2d, v10.2d, v14.2d \n" /* trans q8, q10 60 61 62 63*/ - "trn1 v12.2d, v11.2d, v15.2d \n" /* trans q9, q11 50 51 52 53*/ - "trn2 v13.2d, v11.2d, v15.2d \n" /* trans q9, q11 70 71 72 73*/ - "ldp q6, q7, [%[ptr_din]], #32 \n" /* load r02, r03 to q2, q3 */ - - "str q16, [%[doutc0r0]], #16 \n" /* store c0r0*/ - "str q17, [%[doutc2r0]], #16 \n" /* store c2r0*/ - "str q18, [%[doutc1r0]], #16 \n" /* store c1r0*/ - "str q19, [%[doutc3r0]], #16 \n" /* store c3r0*/ - - "subs %w[cnt], %w[cnt], #1 \n" /* loop count -1*/ - "str q8, [%[doutc4r0]], #16 \n" /* store c0r0*/ - "str q9, [%[doutc6r0]], #16 \n" /* store c2r0*/ - "str q12, [%[doutc5r0]], #16 \n" /* store c1r0*/ - "str q13, [%[doutc7r0]], #16 \n" /* store c3r0*/ - - "bne 1b \n" /* jump to main loop*/ - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [cnt] "+r"(cnt_loop), - [ptr_din] "+r"(din_hei_ptr) - : - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20"); + asm volatile(NCHWC8_TRANS_FP32_COMPUTE 
NCHWC8_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [doutc4r0] "+r"(doutc4_ptr), + [doutc5r0] "+r"(doutc5_ptr), + [doutc6r0] "+r"(doutc6_ptr), + [doutc7r0] "+r"(doutc7_ptr), + [cnt] "+r"(cnt_loop), + [ptr_din] "+r"(din_hei_ptr) + : + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20"); #else - asm volatile( - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! @load data \n" - "1: @ main loop\n" - "vtrn.32 q0, q2 @ trans q0, q2 \n" - "vtrn.32 q4, q6 @ trans q4, q6 \n" - "vswp.32 d1, d8 @ swap d1, d8 \n" - "vswp.32 d5, d12 @ swap d5, d12\n" - - "vtrn.32 q1, q3 @ trans q1, q3 \n" - "vtrn.32 q5, q7 @ trans q5, q7 \n" - "vswp.32 d3, d10 @ swap d3, d10\n" - "vswp.32 d7, d14 @ swap d7, d14\n" - - "subs %[cnt], %[cnt], #1 @ loop count - 1\n" - - "vst1.32 {d0-d1}, [%[doutc0r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d2-d3}, [%[doutc4r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d4-d5}, [%[doutc1r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d6-d7}, [%[doutc5r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d0-d3}, [%[ptr_din]]! @load data \n" - "vld1.32 {d4-d7}, [%[ptr_din]]! @load data \n" - - "vst1.32 {d8-d9}, [%[doutc2r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d10-d11}, [%[doutc6r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d12-d13}, [%[doutc3r0]]! @ store result, add " - "pointer\n" - "vst1.32 {d14-d15}, [%[doutc7r0]]! @ store result, add " - "pointer\n" - - "vld1.32 {d8-d11}, [%[ptr_din]]! @load data \n" - "vld1.32 {d12-d15}, [%[ptr_din]]! 
@load data \n" - - "bne 1b @ jump to main loop\n" - - : [doutc0r0] "+r"(doutc0_ptr), - [doutc1r0] "+r"(doutc1_ptr), - [doutc2r0] "+r"(doutc2_ptr), - [doutc3r0] "+r"(doutc3_ptr), - [doutc4r0] "+r"(doutc4_ptr), - [doutc5r0] "+r"(doutc5_ptr), - [doutc6r0] "+r"(doutc6_ptr), - [doutc7r0] "+r"(doutc7_ptr), - [ptr_din] "+r"(din_hei_ptr), - [cnt] "+r"(cnt_loop) - : - : "q0", "q1", "q2", "q3", "q4"); + asm volatile(NCHWC8_TRANS_FP32_COMPUTE NCHWC8_TRANS_FP32_STORE + : [doutc0r0] "+r"(doutc0_ptr), + [doutc1r0] "+r"(doutc1_ptr), + [doutc2r0] "+r"(doutc2_ptr), + [doutc3r0] "+r"(doutc3_ptr), + [doutc4r0] "+r"(doutc4_ptr), + [doutc5r0] "+r"(doutc5_ptr), + [doutc6r0] "+r"(doutc6_ptr), + [doutc7r0] "+r"(doutc7_ptr), + [ptr_din] "+r"(din_hei_ptr), + [cnt] "+r"(cnt_loop) + : + : "q0", "q1", "q2", "q3", "q4"); #endif } if (we > width) { diff --git a/lite/backends/arm/math/conv_depthwise.h b/lite/backends/arm/math/conv_depthwise.h index 1a23982cd5..b6c3478880 100644 --- a/lite/backends/arm/math/conv_depthwise.h +++ b/lite/backends/arm/math/conv_depthwise.h @@ -85,38 +85,6 @@ void conv_depthwise_3x3s2_fp32(const float* din, bool flag_relu, ARMContext* ctx); -void conv_depthwise_3x3p0_fp32(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - -void conv_depthwise_3x3p1_fp32(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx); - template void conv_depthwise_3x3s1_int8(Dtype* dout, const int8_t* din, diff --git a/lite/backends/arm/math/conv_impl.cc b/lite/backends/arm/math/conv_impl.cc index 010563bf93..dc68e65f42 100644 --- a/lite/backends/arm/math/conv_impl.cc +++ b/lite/backends/arm/math/conv_impl.cc @@ -107,29 +107,35 @@ void im2col(const Dtype* data_im, int width, int kernel_h, int kernel_w, - int pad_h, - int pad_w, + int pad_top, + int pad_bottom, + int pad_left, + int pad_right, int stride_h, int stride_w, int dilation_h, int dilation_w, Dtype* data_col) { const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + (height + pad_top + pad_bottom - (dilation_h * (kernel_h - 1) + 1)) / + stride_h + + 1; const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + (width + pad_left + pad_right - (dilation_w * (kernel_w - 1) + 1)) / + stride_w + + 1; const int channel_size = height * width; for (int channel = channels; channel--; data_im += channel_size) { for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; + int input_row = -pad_top + kernel_row * dilation_h; for (int output_rows = output_h; output_rows; output_rows--) { if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { for (int output_cols = output_w; output_cols; output_cols--) { *(data_col++) = 0; } } else { - int input_col = -pad_w + kernel_col * dilation_w; + int input_col = -pad_left + kernel_col * dilation_w; for (int output_col = output_w; output_col; output_col--) { if (is_a_ge_zero_and_a_lt_b(input_col, width)) { *(data_col++) = data_im[input_row * width + input_col]; @@ -202,7 +208,8 @@ void conv1x1s1_gemm(const float* i_data, k, flag_bias, bias_group, - flag_relu); + flag_relu, + ctx); } 
else { sgemm_prepack(false, m, @@ -361,6 +368,8 @@ void conv_im2col_gemm(const float* i_data, float* tmp_work_space = ctx->workspace_data() + ctx->llc_size() / sizeof(float); + auto paddings = *param.paddings; + auto dilations = *param.dilations; //! use gemv when the output channel size = 1 for (int b = 0; b < num; ++b) { // dC @@ -378,12 +387,14 @@ void conv_im2col_gemm(const float* i_data, win, kernel_h, kernel_w, - param.paddings[0], - param.paddings[1], + paddings[0], + paddings[1], + paddings[2], + paddings[3], param.strides[0], param.strides[1], - param.dilations[0], - param.dilations[1], + dilations[0], + dilations[1], dB); if (n == 1) { @@ -395,7 +406,8 @@ void conv_im2col_gemm(const float* i_data, k, flag_bias, bias_group, - flag_relu); + flag_relu, + ctx); } else { int ldb = n; sgemm_prepack(false, @@ -434,14 +446,16 @@ void conv_im2col_gemm_int8(const int8_t* i_data, const float* scale) { int group = param.groups; auto filter_dims = param.filter->dims(); + auto paddings = *param.paddings; + auto dilations = *param.dilations; int kernel_h = filter_dims[2]; int kernel_w = filter_dims[3]; int stride_h = param.strides[0]; int stride_w = param.strides[1]; - int dila_h = param.dilations[0]; - int dila_w = param.dilations[1]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int dila_h = dilations[0]; + int dila_w = dilations[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; const int m = oc / group; const int n = oh * ow; const int k = ic * kernel_h * kernel_w / group; @@ -482,7 +496,9 @@ void conv_im2col_gemm_int8(const int8_t* i_data, kernel_h, kernel_w, pad_h, + paddings[1], pad_w, + paddings[3], stride_h, stride_w, dila_h, @@ -562,90 +578,83 @@ void conv_depthwise_3x3_fp32(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; - if (pad_w != pad_h) { - LOG(FATAL) << "fp32 depthwise conv3x3 pad_w: " << pad_w - << ", pad_h: " << pad_h << " must be equal"; - return; - } + auto paddings = *param.paddings; + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; int stride = param.strides[1]; int pad = pad_w; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; - if (stride == 1 && pad < 2) { // support pad = [0, 1] - conv_depthwise_3x3s1_fp32(reinterpret_cast(din), - reinterpret_cast(dout), - num, - ch_out, - h_out, - w_out, - ch_in, - h_in, - w_in, - reinterpret_cast(weights), - bias, - pad, - flag_bias, - flag_relu, - ctx); - } else if (stride == 2 && pad < 2) { // support pad = [0, 1] - conv_depthwise_3x3s2_fp32(reinterpret_cast(din), - reinterpret_cast(dout), - num, - ch_out, - h_out, - w_out, - ch_in, - h_in, - w_in, - reinterpret_cast(weights), - bias, - pad, - flag_bias, - flag_relu, - ctx); - } else { - LOG(FATAL) << "fp32 depthwise conv3x3 stride: " << stride - << " or pad(<2): " << pad << " unsupported"; - } -#if 0 - if (pad == 1) { - conv_depthwise_3x3p1_fp32(reinterpret_cast(din), - reinterpret_cast(dout), - num, - ch_out, - h_out, - w_out, - ch_in, - h_in, - w_in, - reinterpret_cast(weights), - bias, - stride, - flag_bias, - flag_relu, - ctx); - } else if (pad == 0 && h_in > 2) { - conv_depthwise_3x3p0_fp32(reinterpret_cast(din), - reinterpret_cast(dout), - num, - ch_out, - h_out, - w_out, - ch_in, - h_in, - w_in, - reinterpret_cast(weights), - bias, - stride, - flag_bias, - flag_relu, - ctx); + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + if (stride == 1) { + if 
(pads_equal && (pad_h == pad_w) && (pad < 2)) { // support pad = [0, 1] + conv_depthwise_3x3s1_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + pad, + flag_bias, + flag_relu, + ctx); + } else { + conv_3x3s1_depthwise_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + param, + ctx); + } + + } else if (stride == 2) { + if (pad_h == pad_w && (pad < 2)) { // support pad = [0, 1] + conv_depthwise_3x3s2_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + pad, + flag_bias, + flag_relu, + ctx); + } else { + conv_3x3s2_depthwise_fp32(reinterpret_cast(din), + reinterpret_cast(dout), + num, + ch_out, + h_out, + w_out, + ch_in, + h_in, + w_in, + reinterpret_cast(weights), + bias, + param, + ctx); + } } else { - LOG(FATAL) << "unsupport this type 3x3 dw conv"; + LOG(FATAL) << "fp32 depthwise conv3x3 stride: " << stride << " unsupported"; } -#endif } void conv_depthwise_5x5_fp32(const void* din, @@ -662,7 +671,8 @@ void conv_depthwise_5x5_fp32(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad = param.paddings[1]; + auto paddings = *param.paddings; + int pad = paddings[0]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; @@ -718,8 +728,9 @@ void conv_depthwise_3x3_int8_fp32(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; @@ -776,8 +787,9 @@ void conv_depthwise_3x3_int8_int8(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; @@ -834,8 +846,9 @@ void conv_depthwise_5x5_int8_fp32(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; @@ -875,8 +888,9 @@ void conv_depthwise_5x5_int8_int8(const void* din, const operators::ConvParam& param, ARMContext* ctx, const float* scale) { - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride = param.strides[1]; bool flag_relu = param.fuse_relu; bool flag_bias = param.bias != nullptr; diff --git a/lite/backends/arm/math/conv_impl.h b/lite/backends/arm/math/conv_impl.h index c5baa31e14..f4d00039aa 100644 --- a/lite/backends/arm/math/conv_impl.h +++ b/lite/backends/arm/math/conv_impl.h @@ -314,7 +314,23 @@ void fill_bias_int8(int* tensor, const int* bias, int channel, int channel_size); +// new winograd +void weight_trans_c4( + float* dest, const float* src, int ic, int oc, 
void* workspace); +void conv_compute_6x6_3x3(const float* input, + float* output, + int num, + int chout, + int hout, + int wout, + int chin, + int hin, + int win, + const float* weight, + const float* bias, + const operators::ConvParam& param, + ARMContext* ctx); } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/conv_winograd_3x3.cc b/lite/backends/arm/math/conv_winograd_3x3.cc index 87b08f6310..894b946a32 100644 --- a/lite/backends/arm/math/conv_winograd_3x3.cc +++ b/lite/backends/arm/math/conv_winograd_3x3.cc @@ -37,9 +37,9 @@ void conv_winograd3x3(const float* din, const operators::ConvParam& param, ARMContext* ctx) { int threads = ctx->threads(); - - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + const int pad_h = paddings[0]; + const int pad_w = paddings[1]; int size_in_channel = win * hin; int size_out_channel = wout * hout; bool flag_relu = param.fuse_relu; diff --git a/lite/backends/arm/math/funcs.h b/lite/backends/arm/math/funcs.h index d8ef6ff47d..8977b5712c 100644 --- a/lite/backends/arm/math/funcs.h +++ b/lite/backends/arm/math/funcs.h @@ -39,10 +39,12 @@ #include "lite/backends/arm/math/im2sequence.h" #include "lite/backends/arm/math/increment.h" #include "lite/backends/arm/math/interpolate.h" +#include "lite/backends/arm/math/layout.h" #include "lite/backends/arm/math/lrn.h" #include "lite/backends/arm/math/negative.h" #include "lite/backends/arm/math/norm.h" #include "lite/backends/arm/math/packed_sgemm.h" +#include "lite/backends/arm/math/packed_sgemm_c4.h" #include "lite/backends/arm/math/pad2d.h" #include "lite/backends/arm/math/pooling.h" #include "lite/backends/arm/math/power.h" diff --git a/lite/backends/arm/math/interpolate.cc b/lite/backends/arm/math/interpolate.cc index f89410ad11..e9e18043df 100644 --- a/lite/backends/arm/math/interpolate.cc +++ b/lite/backends/arm/math/interpolate.cc @@ -22,6 +22,28 @@ namespace lite { namespace arm { namespace math { +inline std::vector get_new_shape( + std::vector list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + vec_new_shape.push_back(static_cast(*tensor->data())); + } + + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + lite::Tensor cpu_starts_tensor; + vec_new_data = + std::vector(new_data, new_data + new_data_tensor->dims().production()); + return vec_new_data; +} + // The following function bilinear_interp is partially base on // https://github.com/Tencent/ncnn/blob/master/src/layer/arm/interp_arm.cpp // Tencent is pleased to support the open source community by making ncnn @@ -472,33 +494,52 @@ void nearest_interp(const float* src, void interpolate(lite::Tensor* X, lite::Tensor* OutSize, + std::vector SizeTensor, + lite::Tensor* Scale, lite::Tensor* Out, int out_height, int out_width, - float height_scale, - float width_scale, + float scale, bool with_align, std::string interpolate_type) { + int in_h = X->dims()[2]; + int in_w = X->dims()[3]; + if (SizeTensor.size() > 0) { + auto new_size = get_new_shape(SizeTensor); + out_height = new_size[0]; + out_width = new_size[1]; + } else { + auto scale_tensor = Scale; + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } + if (scale > 0) 
{ + out_height = static_cast(in_h * scale); + out_width = static_cast(in_w * scale); + } + auto out_size = OutSize; + if (out_size != nullptr) { + auto out_size_data = get_new_data_from_tensor(out_size); + out_height = static_cast(out_size_data[0]); + out_width = static_cast(out_size_data[1]); + } + } + float height_scale = scale; + float width_scale = scale; if (out_width > 0 && out_height > 0) { height_scale = static_cast(out_height / X->dims()[2]); width_scale = static_cast(out_width / X->dims()[3]); } - if (OutSize != nullptr) { - auto OutSize_data = OutSize->data(); - int h_out = OutSize_data[0]; // HW - int w_out = OutSize_data[1]; // HW - int num_cout = Out->dims()[0]; - int c_cout = Out->dims()[1]; - Out->Resize({num_cout, c_cout, h_out, w_out}); - } + int num_cout = X->dims()[0]; + int c_cout = X->dims()[1]; + Out->Resize({num_cout, c_cout, out_height, out_width}); float* dout = Out->mutable_data(); const float* din = X->data(); int out_num = Out->dims()[0]; int out_c = Out->dims()[1]; int count = out_num * out_c; - int in_h = X->dims()[2]; - int in_w = X->dims()[3]; int out_h = Out->dims()[2]; int out_w = Out->dims()[3]; int spatial_in = in_h * in_w; diff --git a/lite/backends/arm/math/interpolate.h b/lite/backends/arm/math/interpolate.h index be250f6a5e..e9c41c5bc8 100644 --- a/lite/backends/arm/math/interpolate.h +++ b/lite/backends/arm/math/interpolate.h @@ -44,11 +44,12 @@ void nearest_interp(const float* src, void interpolate(lite::Tensor* X, lite::Tensor* OutSize, + std::vector SizeTensor, + lite::Tensor* Scale, lite::Tensor* Out, int out_height, int out_width, - float height_scale, - float width_scale, + float scale, bool with_align, std::string interpolate_type); diff --git a/lite/backends/arm/math/layout.cc b/lite/backends/arm/math/layout.cc new file mode 100644 index 0000000000..fd9126ab48 --- /dev/null +++ b/lite/backends/arm/math/layout.cc @@ -0,0 +1,668 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
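For orientation: the TRANS_C4/TRANS_C8 kernels defined in this new file are tiled, register-transpose implementations of a plain NCHW-to-NHWC copy, and the scalar remainder loops later in the file perform the same copy element by element. A minimal scalar sketch of the transform they are expected to match is given below; the helper name nchw_to_nhwc_reference is illustrative only and not part of this patch.

template <typename T>
void nchw_to_nhwc_reference(int N, int C, int size, const T* X, T* Y) {
  // size = H * W; copies X in NCHW layout into Y in NHWC layout,
  // i.e. Y[n][s][c] = X[n][c][s] for every spatial index s.
  for (int n = 0; n < N; ++n) {
    const T* x = X + n * C * size;
    T* y = Y + n * C * size;
    for (int c = 0; c < C; ++c) {
      for (int s = 0; s < size; ++s) {
        y[s * C + c] = x[c * size + s];
      }
    }
  }
}

The vector paths below process four fp32 channels (TRANS_C4) or eight int8 channels (TRANS_C8) per step, transposing a small channel-by-position tile in registers with trn1/trn2 on armv8 and vtrn/vswp on armv7, and fall back to the per-element copy for the channel and spatial tails.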
+ +#include "lite/backends/arm/math/layout.h" +#include +#include +#include "lite/backends/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { +#ifdef __aarch64__ +#define TRANS_C4 \ + "ld1 {v0.4s}, [%[din0_ptr]] \n" \ + "ld1 {v1.4s}, [%[din1_ptr]] \n" \ + "ld1 {v2.4s}, [%[din2_ptr]] \n" \ + "ld1 {v3.4s}, [%[din3_ptr]] \n" \ + \ + "1: \n" \ + "trn1 v4.4s, v0.4s, v1.4s \n" /*00 10 02 12 */ \ + "trn1 v5.4s, v2.4s, v3.4s \n" /*20 30 22 32 */ \ + "trn2 v6.4s, v0.4s, v1.4s \n" /*01 11 03 13 */ \ + "trn2 v7.4s, v2.4s, v3.4s \n" /*21 31 23 33 */ \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride] \n" /* din+=c*size*/ \ + \ + "trn1 v8.2d, v4.2d, v5.2d \n" /*00 10 20 30 */ \ + "trn1 v9.2d, v6.2d, v7.2d \n" /*01 11 21 31 */ \ + "trn2 v10.2d, v4.2d, v5.2d \n" /*02 12 22 32 */ \ + "trn2 v11.2d, v6.2d, v7.2d \n" /*03 13 23 33 */ \ + \ + "ld1 {v0.4s}, [%[din0_ptr]] \n" \ + "ld1 {v1.4s}, [%[din1_ptr]] \n" \ + "ld1 {v2.4s}, [%[din2_ptr]] \n" \ + "ld1 {v3.4s}, [%[din3_ptr]] \n" \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + "str q8, [%[out0_ptr]], #16 \n" \ + "str q9, [%[out1_ptr]], #16 \n" \ + "str q10, [%[out2_ptr]], #16 \n" \ + "str q11, [%[out3_ptr]], #16 \n" \ + "bne 1b \n" + +#define TRANS_C8 \ + "1: \n" \ + "ld1 {v0.8b}, [%[din0_ptr]] \n" \ + "ld1 {v1.8b}, [%[din1_ptr]] \n" \ + "ld1 {v2.8b}, [%[din2_ptr]] \n" \ + "ld1 {v3.8b}, [%[din3_ptr]] \n" \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride_w] \n" /* din+=c*size*/ \ + \ + "trn1 v8.8b, v0.8b, v1.8b \n" /*00 10 02 12 04 14 06 16 */ \ + "trn1 v9.8b, v2.8b, v3.8b \n" /*20 30 22 32 */ \ + "trn2 v12.8b, v0.8b, v1.8b \n" /*01 11 03 13 05 15 07 17 */ \ + "trn2 v13.8b, v2.8b, v3.8b \n" /*21 31 23 33 */ \ + \ + "ld1 {v4.8b}, [%[din0_ptr]] \n" \ + "ld1 {v5.8b}, [%[din1_ptr]] \n" \ + "ld1 {v6.8b}, [%[din2_ptr]] \n" \ + "ld1 {v7.8b}, [%[din3_ptr]] \n" \ + \ + "trn1 v10.8b, v4.8b, v5.8b \n" /*40 50 42 52 */ \ + "trn1 v11.8b, v6.8b, v7.8b \n" /*60 70 62 72 */ \ + "trn2 v14.8b, v4.8b, v5.8b \n" /*41 51 43 53 */ \ + "trn2 v15.8b, v6.8b, v7.8b \n" /*61 71 63 73 */ \ + \ + "trn1 v0.4h, v8.4h, v9.4h \n" /*00 10 20 30 04 14 24 34*/ \ + "trn1 v2.4h, v12.4h, v13.4h \n" /*01 11 21 31 05 15 25 35*/ \ + "trn1 v1.4h, v10.4h, v11.4h \n" /*40 50 60 70 44 54 64 74*/ \ + "trn1 v3.4h, v14.4h, v15.4h \n" /*41 51 61 71 45 55 65 75*/ \ + \ + "trn2 v4.4h, v8.4h, v9.4h \n" /*02 10 20 30 06 14 24 34*/ \ + "trn2 v6.4h, v12.4h, v13.4h \n" /*03 11 21 31 07 15 25 35*/ \ + "trn2 v5.4h, v10.4h, v11.4h \n" /*42 50 60 70 46 54 64 74*/ \ + "trn2 v7.4h, v14.4h, v15.4h \n" /*43 51 61 71 47 55 65 75*/ \ + \ + "trn1 v8.2s, v0.2s, v1.2s \n" /*00 10 20 30 40 50 60 70*/ \ + "trn1 v9.2s, v2.2s, v3.2s \n" /*01 11 21 31 41 51 61 71*/ \ + "trn1 v10.2s, v4.2s, v5.2s \n" /*02 12 22 32 42 50 60 70*/ \ + "trn1 v11.2s, v6.2s, v7.2s \n" /*03 13 23 33 41 51 61 71*/ \ + \ + "trn2 v12.2s, v0.2s, v1.2s \n" /*04 14 24 34 44 54 64 74*/ \ + "trn2 v13.2s, v2.2s, v3.2s \n" /*05 15 25 35 45 55 65 75*/ \ + "trn2 v14.2s, v4.2s, v5.2s \n" /*06 16 22 32 42 50 60 70*/ \ + "trn2 v15.2s, v6.2s, v7.2s \n" /*07 17 23 33 41 51 61 71*/ \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride_w] \n" /* 
din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride_w] \n" /* din+=c*size*/ \ + \ + "subs %w[cnt], %w[cnt], #1 \n" \ + "st1 {v8.8b}, [%[out0_ptr]], #8 \n" \ + "st1 {v9.8b}, [%[out1_ptr]], #8 \n" \ + "st1 {v10.8b}, [%[out2_ptr]], #8 \n" \ + "st1 {v11.8b}, [%[out3_ptr]], #8 \n" \ + \ + "st1 {v11.8b}, [%[out4_ptr]], #8 \n" \ + "st1 {v12.8b}, [%[out5_ptr]], #8 \n" \ + "st1 {v13.8b}, [%[out6_ptr]], #8 \n" \ + "st1 {v14.8b}, [%[out7_ptr]], #8 \n" \ + "bne 1b \n" + +#else +#define TRANS_C4 \ + "1: \n" \ + "vld1.32 {d0-d1}, [%[din0_ptr]] \n" \ + "vld1.32 {d2-d3}, [%[din1_ptr]] \n" \ + "vld1.32 {d4-d5}, [%[din2_ptr]] \n" \ + "vld1.32 {d6-d7}, [%[din3_ptr]] \n" \ + \ + "vtrn.32 q0, q1 \n" /*00 10 02 12 01 11 03 13*/ \ + "vtrn.32 q2, q3 \n" /*20 30 22 32 21 31 23 33 */ \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride] \n" /* din+=c*size*/ \ + "vswp d1, d4 \n" \ + "vswp d3, d6 \n" \ + \ + "subs %[cnt], %[cnt], #1 \n" \ + "vst1.32 {d0-d1}, [%[out0_ptr]]! \n" \ + "vst1.32 {d2-d3}, [%[out1_ptr]]! \n" \ + "vst1.32 {d4-d5}, [%[out2_ptr]]! \n" \ + "vst1.32 {d6-d7}, [%[out3_ptr]]! \n" \ + "bne 1b \n" + +#define TRANS_C8 \ + "1: \n" \ + "vld1.8 d0, [%[din0_ptr]] \n" \ + "vld1.8 d1, [%[din1_ptr]] \n" \ + "vld1.8 d2, [%[din2_ptr]] \n" \ + "vld1.8 d3, [%[din3_ptr]] \n" \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride_w] \n" /* din+=c*size*/ \ + \ + "vtrn.8 d0, d1 \n" /*00 10 02 12 04 14 06 16*/ \ + "vtrn.8 d2, d3 \n" /*20 30 22 32 24 34 26 36 */ \ + \ + "vld1.8 d4, [%[din0_ptr]] \n" \ + "vld1.8 d5, [%[din1_ptr]] \n" \ + "vld1.8 d6, [%[din2_ptr]] \n" \ + "vld1.8 d7, [%[din3_ptr]] \n" \ + \ + "vtrn.16 d0, d2 \n" /*00 10 20 30 04 14 24 34*/ \ + "vtrn.16 d1, d3 \n" /* 01 11 21 31 05 15 25 35 */ \ + "vtrn.8 d4, d5 \n" /*40 50 02 12 04 14 06 16*/ \ + "vtrn.8 d6, d7 \n" /*60 70 22 32 24 34 26 36 */ \ + \ + "add %[din0_ptr], %[din0_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din1_ptr], %[din1_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din2_ptr], %[din2_ptr], %[stride_w] \n" /* din+=c*size*/ \ + "add %[din3_ptr], %[din3_ptr], %[stride_w] \n" /* din+=c*size*/ \ + \ + "vtrn.16 d4, d6 \n" /*40 50 60 70 04 14 24 34*/ \ + "vtrn.16 d5, d7 \n" /* 41 51 61 71 05 15 25 35 */ \ + \ + "vtrn.32 d0, d4 \n" /*00 10 20 30 40 50 60 70*/ \ + "vtrn.32 d1, d5 \n" /* 01 11 21 31 41 51 61 71 */ \ + "vtrn.32 d2, d6 \n" /*02 12 22 32 42 52 62 72*/ \ + "vtrn.32 d3, d7 \n" /* 03 11 21 33 43 53 63 73 */ \ + \ + "subs %[cnt], %[cnt], #1 \n" \ + "vst1.8 {d0}, [%[out0_ptr]]! \n" \ + "vst1.8 {d1}, [%[out1_ptr]]! \n" \ + "vst1.8 {d2}, [%[out2_ptr]]! \n" \ + "vst1.8 {d3}, [%[out3_ptr]]! \n" \ + "vst1.8 {d4}, [%[out4_ptr]]! \n" \ + "vst1.8 {d5}, [%[out5_ptr]]! \n" \ + "vst1.8 {d6}, [%[out6_ptr]]! \n" \ + "vst1.8 {d7}, [%[out7_ptr]]! 
\n" \ + "bne 1b \n" + +#endif +template <> +void NCHW2NHWC(int N, int C, int size, const float* X, float* Y) { + int cnt = C >> 2; + int remain = C % 4; + int sum = C * size; + int stride = size << 4; // 4 * size + int stride_w = stride >> 2; + for (int n = 0; n < N; n++) { + const float* din = X + n * sum; + float* dout = Y + n * sum; + int s = 0; +#pragma omp parallel for + for (s = 0; s < size - 3; s += 4) { + const float* din0_ptr = din + s; + const float* din1_ptr = din0_ptr + size; + const float* din2_ptr = din1_ptr + size; + const float* din3_ptr = din2_ptr + size; + float* out0_ptr = dout + s * C; + float* out1_ptr = out0_ptr + C; + float* out2_ptr = out1_ptr + C; + float* out3_ptr = out2_ptr + C; + int cnt_num = cnt; + if (cnt_num > 0) { +#ifdef __aarch64__ + asm volatile(TRANS_C4 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [cnt] "+r"(cnt_num), + [stride] "+r"(stride) + : + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12"); +#else + asm volatile(TRANS_C4 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [cnt] "+r"(cnt_num), + [stride] "+r"(stride) + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif + } + for (int i = 0; i < remain; i++) { + const float* ptr = din0_ptr; + *out0_ptr++ = *ptr++; + *out1_ptr++ = *ptr++; + *out2_ptr++ = *ptr++; + *out3_ptr++ = *ptr++; + din0_ptr += size; + } + } + // remain size + for (; s < size; s++) { + const float* din0_ptr = din + s; + const float* din1_ptr = din0_ptr + size; + const float* din2_ptr = din1_ptr + size; + const float* din3_ptr = din2_ptr + size; + float* out0_ptr = dout + s * C; + for (int i = 0; i < cnt; i++) { + *out0_ptr++ = *din0_ptr; + *out0_ptr++ = *din1_ptr; + *out0_ptr++ = *din2_ptr; + *out0_ptr++ = *din3_ptr; + din0_ptr += stride_w; + din1_ptr += stride_w; + din2_ptr += stride_w; + din3_ptr += stride_w; + } + for (int i = 0; i < remain; i++) { + *out0_ptr++ = *din0_ptr; + din0_ptr += size; + } + } + } +} +template <> +void NCHW2NHWC(int N, int C, int size, const int8_t* X, int8_t* Y) { + int cnt = C >> 3; + int remain = C % 8; + int sum = C * size; + int stride = size << 3; // 8 * size + int stride_w = size << 4; // 4 * size * 4 + for (int n = 0; n < N; n++) { + const int8_t* din = X + n * sum; + int8_t* dout = Y + n * sum; + int s = 0; +#pragma omp parallel for + for (s = 0; s < size - 7; s += 8) { + const int8_t* din0_ptr = din + s; + const int8_t* din1_ptr = din0_ptr + size; + const int8_t* din2_ptr = din1_ptr + size; + const int8_t* din3_ptr = din2_ptr + size; + int8_t* out0_ptr = dout + s * C; + int8_t* out1_ptr = out0_ptr + C; + int8_t* out2_ptr = out1_ptr + C; + int8_t* out3_ptr = out2_ptr + C; + int8_t* out4_ptr = out3_ptr + C; + int8_t* out5_ptr = out4_ptr + C; + int8_t* out6_ptr = out5_ptr + C; + int8_t* out7_ptr = out6_ptr + C; + int cnt_num = cnt; + if (cnt_num > 0) { +#ifdef __aarch64__ + asm volatile(TRANS_C8 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [out4_ptr] 
"+r"(out4_ptr), + [out5_ptr] "+r"(out5_ptr), + [out6_ptr] "+r"(out6_ptr), + [out7_ptr] "+r"(out7_ptr), + [cnt] "+r"(cnt_num), + [stride_w] "+r"(stride_w) + : + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); +#else + asm volatile(TRANS_C8 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [out4_ptr] "+r"(out4_ptr), + [out5_ptr] "+r"(out5_ptr), + [out6_ptr] "+r"(out6_ptr), + [out7_ptr] "+r"(out7_ptr), + [cnt] "+r"(cnt_num), + [stride_w] "+r"(stride_w) + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif + } + // const int8_t* din_ptr = din + 8 * cnt * size + s; // remain channel + for (int i = 0; i < remain; i++) { + const int8_t* ptr = din0_ptr; + *out0_ptr = *ptr++; + *out1_ptr = *ptr++; + *out2_ptr = *ptr++; + *out3_ptr = *ptr++; + din0_ptr += size; + *out4_ptr = *ptr++; + *out5_ptr = *ptr++; + *out6_ptr = *ptr++; + *out7_ptr = *ptr++; + } + } + // remain size + for (; s < size; s++) { + const int8_t* din0_ptr = din + s; + const int8_t* din1_ptr = din0_ptr + size; + const int8_t* din2_ptr = din1_ptr + size; + const int8_t* din3_ptr = din2_ptr + size; + const int8_t* din4_ptr = din3_ptr + size; + const int8_t* din5_ptr = din4_ptr + size; + const int8_t* din6_ptr = din5_ptr + size; + const int8_t* din7_ptr = din6_ptr + size; + int8_t* out0_ptr = dout + s * C; + for (int i = 0; i < cnt; i++) { + *out0_ptr++ = *din0_ptr; + *out0_ptr++ = *din1_ptr; + *out0_ptr++ = *din2_ptr; + *out0_ptr++ = *din3_ptr; + *out0_ptr++ = *din4_ptr; + *out0_ptr++ = *din5_ptr; + *out0_ptr++ = *din6_ptr; + *out0_ptr++ = *din7_ptr; + din0_ptr += stride; + din1_ptr += stride; + din2_ptr += stride; + din3_ptr += stride; + din4_ptr += stride; + din5_ptr += stride; + din6_ptr += stride; + din7_ptr += stride; + } + for (int i = 0; i < remain; i++) { + *out0_ptr++ = *din0_ptr; + din0_ptr += size; + } + } + } +} +template <> +void NHWC2NCHW(int N, int C, int size, const float* X, float* Y) { + int cnt = size >> 2; + int remain = size % 4; + int sum = C * size; + int stride = C << 4; // 4 * size + int stride_w = C << 2; + for (int n = 0; n < N; n++) { + const float* din = X + n * sum; + float* dout = Y + n * sum; + int s = 0; +#pragma omp parallel for + for (s = 0; s < C - 3; s += 4) { + const float* din0_ptr = din + s; + const float* din1_ptr = din0_ptr + C; + const float* din2_ptr = din1_ptr + C; + const float* din3_ptr = din2_ptr + C; + float* out0_ptr = dout + s * size; + float* out1_ptr = out0_ptr + size; + float* out2_ptr = out1_ptr + size; + float* out3_ptr = out2_ptr + size; + int cnt_num = cnt; + if (cnt_num > 0) { +#ifdef __aarch64__ + asm volatile(TRANS_C4 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [cnt] "+r"(cnt_num), + [stride] "+r"(stride) + : + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11"); +#else + asm volatile(TRANS_C4 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [cnt] 
"+r"(cnt_num), + [stride] "+r"(stride) + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif + } + for (int i = 0; i < remain; i++) { + const float* ptr = din0_ptr; + *out0_ptr++ = *ptr++; + *out1_ptr++ = *ptr++; + *out2_ptr++ = *ptr++; + *out3_ptr++ = *ptr++; + din0_ptr += C; + } + } + // remain size + for (; s < C; s++) { + const float* din0_ptr = din + s; + const float* din1_ptr = din0_ptr + C; + const float* din2_ptr = din1_ptr + C; + const float* din3_ptr = din2_ptr + C; + float* out0_ptr = dout + s * size; + for (int i = 0; i < cnt; i++) { + *out0_ptr++ = *din0_ptr; + *out0_ptr++ = *din1_ptr; + *out0_ptr++ = *din2_ptr; + *out0_ptr++ = *din3_ptr; + din0_ptr += stride_w; + din1_ptr += stride_w; + din2_ptr += stride_w; + din3_ptr += stride_w; + } + for (int i = 0; i < remain; i++) { + *out0_ptr++ = *din0_ptr; + din0_ptr += C; + } + } + } +} +template <> +void NHWC2NCHW(int N, int C, int size, const int8_t* X, int8_t* Y) { + int cnt = size >> 3; + int remain = size % 8; + int sum = C * size; + int stride = C << 3; // 8 * size + int stride_w = C << 4; // 4 * size + for (int n = 0; n < N; n++) { + const int8_t* din = X + n * sum; + int8_t* dout = Y + n * sum; + int s = 0; +#pragma omp parallel for + for (s = 0; s < C - 7; s += 8) { + const int8_t* din0_ptr = din + s; + const int8_t* din1_ptr = din0_ptr + C; + const int8_t* din2_ptr = din1_ptr + C; + const int8_t* din3_ptr = din2_ptr + C; + const int8_t* din4_ptr = din3_ptr + C; + const int8_t* din5_ptr = din4_ptr + C; + const int8_t* din6_ptr = din5_ptr + C; + const int8_t* din7_ptr = din6_ptr + C; + int8_t* out0_ptr = dout + s * size; + int8_t* out1_ptr = out0_ptr + size; + int8_t* out2_ptr = out1_ptr + size; + int8_t* out3_ptr = out2_ptr + size; + int8_t* out4_ptr = out3_ptr + size; + int8_t* out5_ptr = out4_ptr + size; + int8_t* out6_ptr = out5_ptr + size; + int8_t* out7_ptr = out6_ptr + size; + int cnt_num = cnt; + if (cnt_num > 0) { +#ifdef __aarch64__ + asm volatile(TRANS_C8 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [out4_ptr] "+r"(out4_ptr), + [out5_ptr] "+r"(out5_ptr), + [out6_ptr] "+r"(out6_ptr), + [out7_ptr] "+r"(out7_ptr), + [cnt] "+r"(cnt_num), + [stride_w] "+r"(stride_w) + : + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); +#else + asm volatile(TRANS_C8 + : [din0_ptr] "+r"(din0_ptr), + [din1_ptr] "+r"(din1_ptr), + [din2_ptr] "+r"(din2_ptr), + [din3_ptr] "+r"(din3_ptr), + [out0_ptr] "+r"(out0_ptr), + [out1_ptr] "+r"(out1_ptr), + [out2_ptr] "+r"(out2_ptr), + [out3_ptr] "+r"(out3_ptr), + [out4_ptr] "+r"(out4_ptr), + [out5_ptr] "+r"(out5_ptr), + [out6_ptr] "+r"(out6_ptr), + [out7_ptr] "+r"(out7_ptr), + [cnt] "+r"(cnt_num), + [stride_w] "+r"(stride_w) + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +#endif + } + for (int i = 0; i < remain; i++) { + const int8_t* ptr = din0_ptr; + *out0_ptr++ = *ptr++; + *out1_ptr++ = *ptr++; + *out2_ptr++ = *ptr++; + *out3_ptr++ = *ptr++; + *out4_ptr++ = *ptr++; + *out5_ptr++ = *ptr++; + *out6_ptr++ = *ptr++; + *out7_ptr++ = *ptr++; + din0_ptr += C; + } + } + // remain size + for (; s < C; s++) { + const int8_t* din0_ptr = din + s; + const int8_t* din1_ptr = din0_ptr + C; + const int8_t* din2_ptr = din1_ptr + C; + const int8_t* din3_ptr = din2_ptr + C; + const int8_t* din4_ptr = din3_ptr + C; + 
const int8_t* din5_ptr = din4_ptr + C; + const int8_t* din6_ptr = din5_ptr + C; + const int8_t* din7_ptr = din6_ptr + C; + int8_t* out0_ptr = dout + s * size; + for (int i = 0; i < cnt; i++) { + *out0_ptr++ = *din0_ptr; + *out0_ptr++ = *din1_ptr; + *out0_ptr++ = *din2_ptr; + *out0_ptr++ = *din3_ptr; + *out0_ptr++ = *din4_ptr; + *out0_ptr++ = *din5_ptr; + *out0_ptr++ = *din6_ptr; + *out0_ptr++ = *din7_ptr; + din0_ptr += stride; + din1_ptr += stride; + din2_ptr += stride; + din3_ptr += stride; + din4_ptr += stride; + din5_ptr += stride; + din6_ptr += stride; + din7_ptr += stride; + } + for (int i = 0; i < remain; i++) { + *out0_ptr++ = *din0_ptr; + din0_ptr += C; + } + } + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/layout.h b/lite/backends/arm/math/layout.h new file mode 100644 index 0000000000..ed0e2f8b78 --- /dev/null +++ b/lite/backends/arm/math/layout.h @@ -0,0 +1,30 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace paddle { +namespace lite { +namespace arm { +namespace math { +template +void NCHW2NHWC(int N, int C, int HxW, const T* X, T* Y); + +template +void NHWC2NCHW(int N, int C, int HxW, const T* X, T* Y); + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/packed_sgemm.cc b/lite/backends/arm/math/packed_sgemm.cc index 0d6eed9904..092e6937c4 100644 --- a/lite/backends/arm/math/packed_sgemm.cc +++ b/lite/backends/arm/math/packed_sgemm.cc @@ -53,6 +53,38 @@ void sgemm_prepacked_8x12(bool is_transB, bool has_bias, bool has_relu, ARMContext *ctx); + +void pack_m4(float *out, + const float *in, + float alpha, + int ldin, + int m0, + int mmax, + int k0, + int kmax); + +void pack_trans_m4(float *out, + const float *in, + float alpha, + int ldin, + int m0, + int mmax, + int k0, + int kmax); +void sgemm_prepacked_4x4(bool is_transB, + int M, + int N, + int K, + const float *A_packed, + const float *B, + int ldb, + float beta, + float *C, + int ldc, + const float *bias, + bool has_bias, + bool has_relu, + ARMContext *ctx); #else // for kA72 void prepackA_6x8(float *out, @@ -139,13 +171,21 @@ void prepackA(float *out, bool is_trans, ARMContext *ctx) { #ifdef __aarch64__ - if (is_trans) { - prepackA_trans_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); + if (mmax <= 4) { + if (is_trans) { + pack_trans_m4(out, in, alpha, ldin, m0, mmax, k0, kmax); + } else { + pack_m4(out, in, alpha, ldin, m0, mmax, k0, kmax); + } } else { - prepackA_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); + if (is_trans) { + prepackA_trans_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); + } else { + prepackA_8x12(out, in, alpha, ldin, m0, mmax, k0, kmax); + } } #else - if (ctx->arch() == kA73) { + if (ctx->arch() == kA73 || mmax <= 4) { if (is_trans) { prepackA_trans_4x8(out, in, alpha, ldin, m0, mmax, k0, kmax); } else { @@ -212,22 +252,39 @@ void 
sgemm_prepack(bool is_transB, bool has_relu, ARMContext *ctx) { #ifdef __aarch64__ - sgemm_prepacked_8x12(is_transB, - M, - N, - K, - A_packed, - B, - ldb, - beta, - C, - ldc, - bias, - has_bias, - has_relu, - ctx); + if (M <= 4) { + sgemm_prepacked_4x4(is_transB, + M, + N, + K, + A_packed, + B, + ldb, + beta, + C, + ldc, + bias, + has_bias, + has_relu, + ctx); + } else { + sgemm_prepacked_8x12(is_transB, + M, + N, + K, + A_packed, + B, + ldb, + beta, + C, + ldc, + bias, + has_bias, + has_relu, + ctx); + } #else // armv7 - if (ctx->arch() == kA73) { + if (ctx->arch() == kA73 || M <= 4) { sgemm_prepacked_4x8(is_transB, M, N, @@ -522,6 +579,147 @@ void prepackA_8x12(float *dout, } } } +void pack_m4(float *dout, + const float *inptr, + float alpha, + int ldin, + int m0, + int mmax, + int k0, + int kmax) { + int x_len = kmax - k0; + int stride = x_len * 4; + float zerobuff[x_len]; // NOLINT + memset(zerobuff, 0, sizeof(float) * x_len); + bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; + +#pragma omp parallel for + for (int y = m0; y < mmax; y += 4) { + float *outptr = dout + stride * (y - m0) / 4; + + const float *inptr0 = inptr + y * ldin + k0; + const float *inptr1 = inptr0 + ldin; + const float *inptr2 = inptr1 + ldin; + const float *inptr3 = inptr2 + ldin; + + asm volatile( + "prfm pldl1keep, [%[ptr0]] \n" + "prfm pldl1keep, [%[ptr0], #64] \n" + "prfm pldl1keep, [%[ptr1]] \n" + "prfm pldl1keep, [%[ptr1], #64] \n" + "prfm pldl1keep, [%[ptr2]] \n" + "prfm pldl1keep, [%[ptr2], #64] \n" + "prfm pldl1keep, [%[ptr3]] \n" + "prfm pldl1keep, [%[ptr3], #64] \n" + : + : [ptr0] "r"(inptr0), + [ptr1] "r"(inptr1), + [ptr2] "r"(inptr2), + [ptr3] "r"(inptr3) + : "memory"); + + int x = x_len; + //! cope with row index exceed real size, set to zero buffer + if ((y + 3) >= mmax) { + switch ((y + 3) - mmax) { + case 2: + inptr1 = zerobuff; + case 1: + inptr2 = zerobuff; + case 0: + inptr3 = zerobuff; + default: + break; + } + } + for (; x > 7; x -= 8) { + asm volatile( + "cbz %w[has_alpha], 0f\n" /* check alpha == 1.f? 
*/ + "dup v31.4s, %w[alpha]\n" /* alpha to vector */ + "ldp q0, q1, [%[inptr0]], #32\n" /* load r0, a0~a7 */ + "ldp q2, q3, [%[inptr1]], #32\n" /* load r1, b0~b7 */ + "fmul v0.4s, v31.4s, v0.4s\n" /* mul alpha */ + "fmul v1.4s, v31.4s, v1.4s\n" /* mul alpha */ + "ldp q4, q5, [%[inptr2]], #32\n" /* load r2, c0~c7 */ + "fmul v2.4s, v31.4s, v2.4s\n" /* mul alpha */ + "fmul v3.4s, v31.4s, v3.4s\n" /* mul alpha */ + "ldp q6, q7, [%[inptr3]], #32\n" /* load r3, d0~d7 */ + "fmul v4.4s, v31.4s, v4.4s\n" /* mul alpha */ + "fmul v5.4s, v31.4s, v5.4s\n" /* mul alpha */ + "fmul v6.4s, v31.4s, v6.4s\n" /* mul alpha */ + "fmul v7.4s, v31.4s, v7.4s\n" /* mul alpha */ + "b 1f\n" /* to main process */ + "0: \n" /* alpha == 1 */ + "ldp q0, q1, [%[inptr0]], #32\n" /* load r0, a0~a7 */ + "ldp q2, q3, [%[inptr1]], #32\n" /* load r1, b0~b7 */ + "ldp q4, q5, [%[inptr2]], #32\n" /* load r2, c0~c7 */ + "ldp q6, q7, [%[inptr3]], #32\n" /* load r3, d0~d7 */ + "1: \n" /* main process */ + "trn1 v8.4s, v0.4s, v2.4s\n" /* a0b0a2b2*/ + "trn2 v9.4s, v0.4s, v2.4s\n" /* a1b1a3b3*/ + "trn1 v10.4s, v1.4s, v3.4s\n" /* a4b4a6b6*/ + "trn2 v11.4s, v1.4s, v3.4s\n" /* a5b5a7b7*/ + + "trn1 v12.4s, v4.4s, v6.4s\n" /* c0d0c2d2*/ + "trn2 v13.4s, v4.4s, v6.4s\n" /* c1d1c3d3*/ + "trn1 v14.4s, v5.4s, v7.4s\n" /* c4d4c6d6*/ + "trn2 v15.4s, v5.4s, v7.4s\n" /* c5d5c7d7*/ + + "trn1 v0.2d, v8.2d, v12.2d\n" /* a0b0c0d0 */ + "trn1 v1.2d, v9.2d, v13.2d\n" /* a1b1c1d1 */ + "trn1 v2.2d, v10.2d, v14.2d\n" /* a4b4c4d4 */ + "trn1 v3.2d, v11.2d, v15.2d\n" /* a5b5c5d5 */ + + "trn2 v4.2d, v8.2d, v12.2d\n" /* a2b2c2d2 */ + "trn2 v5.2d, v9.2d, v13.2d\n" /* a3b3c3d3 */ + "stp q0, q1, [%[outptr]], #32\n" /* save q0, q1, a0~h0*/ + "trn2 v6.2d, v10.2d, v14.2d\n" /* a6b6c6d6 */ + "trn2 v7.2d, v11.2d, v15.2d\n" /* a7b7c7d7 */ + "stp q4, q5, [%[outptr]], #32\n" /* save q2, q3, a1~h1*/ + "stp q2, q3, [%[outptr]], #32\n" /* save q4, q5, a2~h2*/ + "stp q6, q7, [%[outptr]], #32\n" /* save q6, q7, a3~h3*/ + + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outptr] "+r"(outptr) + : [alpha] "r"(alpha), [has_alpha] "r"(has_alpha) + : "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "cc", + "memory"); + } + + for (; x > 0; x--) { + if (has_alpha) { + *outptr++ = *inptr0++ * alpha; + *outptr++ = *inptr1++ * alpha; + *outptr++ = *inptr2++ * alpha; + *outptr++ = *inptr3++ * alpha; + } else { + *outptr++ = *inptr0++; + *outptr++ = *inptr1++; + *outptr++ = *inptr2++; + *outptr++ = *inptr3++; + } + } + } +} void prepackA_trans_8x12(float *outptr, const float *in, @@ -682,6 +880,128 @@ void prepackA_trans_8x12(float *outptr, } } } +void pack_trans_m4(float *outptr, + const float *in, + float alpha, + int ldin, + int m0, + int mmax, + int k0, + int kmax) { + auto inptr = in + k0 * ldin + m0; + uint32_t mask_buffer[4] = {0, 1, 2, 3}; + int x_len = mmax - m0; + int y_len = kmax - k0; + int right_remain = x_len - 4 * (x_len / 4); + int stride_out = 4 * y_len; + + float32x4_t vzero = vdupq_n_f32(0.f); + uint32x4_t vmask1 = + vcltq_u32(vld1q_u32(mask_buffer), vdupq_n_u32(right_remain)); + + bool has_alpha = fabsf(alpha - 1.f) > 1e-8f; + float32x4_t valpha = vdupq_n_f32(alpha); + +#pragma omp parallel for + for (int y = 0; y < y_len - 3; y += 4) { + const float *ptr0 = inptr + y * ldin; + const float *ptr1 = ptr0 + ldin; + const float *ptr2 = ptr1 + ldin; + const float *ptr3 = ptr2 + ldin; + + asm volatile( + "prfm pldl1keep, [%[ptr0]] \n" + "prfm 
pldl1keep, [%[ptr0], #64] \n" + "prfm pldl1keep, [%[ptr1]] \n" + "prfm pldl1keep, [%[ptr1], #64] \n" + "prfm pldl1keep, [%[ptr2]] \n" + "prfm pldl1keep, [%[ptr2], #64] \n" + "prfm pldl1keep, [%[ptr3]] \n" + "prfm pldl1keep, [%[ptr3], #64] \n" + : + : [ptr0] "r"(ptr0), [ptr1] "r"(ptr1), [ptr2] "r"(ptr2), [ptr3] "r"(ptr3) + : "memory"); + + float *outptr_row_col = outptr + y * 4; + int i = 0; + for (; i < x_len - 3; i += 4) { + float32x4_t vr00 = vld1q_f32(ptr0); + float32x4_t vr10 = vld1q_f32(ptr1); + float32x4_t vr20 = vld1q_f32(ptr2); + float32x4_t vr30 = vld1q_f32(ptr3); + if (has_alpha) { + vr00 = vmulq_f32(vr00, valpha); + vr10 = vmulq_f32(vr10, valpha); + vr20 = vmulq_f32(vr20, valpha); + vr30 = vmulq_f32(vr30, valpha); + } + + vst1q_f32(outptr_row_col, vr00); + vst1q_f32(outptr_row_col + 4, vr10); + vst1q_f32(outptr_row_col + 8, vr20); + vst1q_f32(outptr_row_col + 12, vr30); + + ptr0 += 4; + ptr1 += 4; + ptr2 += 4; + ptr3 += 4; + + outptr_row_col += stride_out; + } + if (right_remain > 0) { + float32x4_t vr00 = vld1q_f32(ptr0); + float32x4_t vr10 = vld1q_f32(ptr1); + float32x4_t vr20 = vld1q_f32(ptr2); + float32x4_t vr30 = vld1q_f32(ptr3); + + if (has_alpha) { + vr00 = vmulq_f32(vr00, valpha); + vr10 = vmulq_f32(vr10, valpha); + vr20 = vmulq_f32(vr20, valpha); + vr30 = vmulq_f32(vr30, valpha); + } + + float32x4_t vr00_1 = vbslq_f32(vmask1, vr00, vzero); + float32x4_t vr10_1 = vbslq_f32(vmask1, vr10, vzero); + float32x4_t vr20_1 = vbslq_f32(vmask1, vr20, vzero); + float32x4_t vr30_1 = vbslq_f32(vmask1, vr30, vzero); + + vst1q_f32(outptr_row_col, vr00_1); + vst1q_f32(outptr_row_col + 4, vr10_1); + vst1q_f32(outptr_row_col + 8, vr20_1); + vst1q_f32(outptr_row_col + 12, vr30_1); + } + } + +#pragma omp parallel for + for (int y = 4 * (y_len / 4); y < y_len; ++y) { + const float *ptr0 = inptr + y * ldin; + float *outptr_row_col = outptr + y * 4; + int i = 0; + for (; i < x_len - 3; i += 4) { + float32x4_t vr0 = vld1q_f32(ptr0); + if (has_alpha) { + vr0 = vmulq_f32(vr0, valpha); + } + vst1q_f32(outptr_row_col, vr0); + + ptr0 += 4; + + outptr_row_col += stride_out; + } + if (right_remain > 0) { + float32x4_t vr0 = vld1q_f32(ptr0); + + if (has_alpha) { + vr0 = vmulq_f32(vr0, valpha); + } + + float32x4_t vr0_1 = vbslq_f32(vmask1, vr0, vzero); + + vst1q_f32(outptr_row_col, vr0_1); + } + } +} #else // __aarch64__ void prepackA_6x8(float* outptr, @@ -2592,6 +2912,292 @@ void sgemm_prepacked_8x12(bool is_transB, } } } + +void sgemm_prepacked_4x4(bool is_transB, + int M, + int N, + int K, + const float *A_packed, + const float *B, + int ldb, + float beta, + float *C, + int ldc, + const float *bias, + bool has_bias, + bool has_relu, + ARMContext *ctx) { + size_t l2_cache = ctx->llc_size() > 0 ? ctx->llc_size() : 512 * 1024; + auto workspace = ctx->workspace_data(); + int threads = ctx->threads(); + + const int n_block = 4; + const int m_block = 4; + //! MBLOCK * x (result) + MBLOCK * k (A) + x * k (B) = l2 + int x_block = (l2_cache - (m_block * K)) / (sizeof(float) * (K + m_block)); + x_block /= n_block; + x_block *= n_block; + int x_num = (N + (x_block - 1)) / x_block; + x_block = (N + x_num - 1) / x_num; + x_block = (x_block + n_block - 1) / n_block; + x_block *= n_block; + x_block = x_block < n_block ? n_block : x_block; + + // unroll 2 loop + int tail_pre = (K & (KBLOCK - 1)); + int k_pre = ((K + KBLOCK - 1) / KBLOCK) - 1; + if (tail_pre == 0) { + tail_pre = KBLOCK; + } + + bool flag_p_remain = false; + int remain = 0; + + int has_beta = fabsf(beta) > 1e-8f ? 1 : 0; + //! 
apanel is pre_compute outside gemm + for (unsigned int x0 = 0; x0 < N; x0 += x_block) { + unsigned int xmax = x0 + x_block; + if (xmax > N) { + xmax = N; + } + int bblocks = (xmax - x0 + n_block - 1) / n_block; + remain = xmax - x0 - (bblocks - 1) * n_block; + if (remain > 0) { + flag_p_remain = true; + } + //! load bpanel + float *b_pannel = workspace; + if (is_transB) { + pack_m4(b_pannel, B, 1.0f, ldb, x0, xmax, 0, K); + } else { + pack_trans_m4(b_pannel, B, 1.0f, ldb, x0, xmax, 0, K); + } +#pragma omp parallel for num_threads(threads) + for (unsigned int y = 0; y < M; y += m_block) { + unsigned int ymax = y + m_block; + if (ymax > M) { + ymax = M; + } + + float bias_local[4] = {0}; + if (has_bias) { + bias_local[0] = bias[y]; + bias_local[1] = bias[y + 1]; + bias_local[2] = bias[y + 2]; + bias_local[3] = bias[y + 3]; + } + + float cout0[n_block]; // NOLINT + float cout1[n_block]; // NOLINT + float cout2[n_block]; // NOLINT + float cout3[n_block]; // NOLINT + + float *c_ptr0 = C + y * ldc + x0; + float *c_ptr1 = c_ptr0 + ldc; + float *c_ptr2 = c_ptr1 + ldc; + float *c_ptr3 = c_ptr2 + ldc; + + float *pout0 = c_ptr0; + float *pout1 = c_ptr1; + float *pout2 = c_ptr2; + float *pout3 = c_ptr3; + + const float *a_ptr_l = A_packed + y * K; + const float *b_ptr_l = b_pannel; + for (int xb = 0; xb < bblocks; xb++) { + if ((y + 3) >= ymax) { + switch ((y + 3) - ymax) { + case 2: + c_ptr1 = cout1; + case 1: + c_ptr2 = cout2; + case 0: + c_ptr3 = cout3; + default: + break; + } + } + if (flag_p_remain && (xb == bblocks - 1)) { + pout0 = c_ptr0; + pout1 = c_ptr1; + pout2 = c_ptr2; + pout3 = c_ptr3; + + c_ptr0 = cout0; + c_ptr1 = cout1; + c_ptr2 = cout2; + c_ptr3 = cout3; + if (has_beta) { + for (int i = 0; i < remain; ++i) { + cout0[i] = pout0[i]; + cout1[i] = pout1[i]; + cout2[i] = pout2[i]; + cout3[i] = pout3[i]; + } + } + } + const float *a_ptr = a_ptr_l; + const float *b_ptr = b_ptr_l + xb * K * 4; + int tail = tail_pre; + int k = k_pre; + // clang-format off + asm volatile( + "prfm pldl1keep, [%[a_ptr]]\n" /* preload a*/ + "ld1 {v2.4s}, [%[bias_ptr]]\n" /* load bias to q2, q3*/ + "dup v8.4s, v2.s[0]\n" /* out0 = 0 */ + "prfm pldl1keep, [%[b_ptr]]\n" /* preload b*/ + "dup v9.4s, v2.s[1]\n" /* out1 = 0*/ + "prfm pldl1keep, [%[a_ptr], #64]\n" /* preload a*/ + "dup v10.4s, v2.s[2]\n" /* out2 = 0*/ + "prfm pldl1keep, [%[b_ptr], #64]\n" /* preload b*/ + "dup v11.4s, v2.s[3]\n" /* out3 = 0*/ + "cbz %w[has_beta], 0f\n" /* check beta == 0? 
*/ + /* process beta */ + "dup v7.4s, %w[beta]\n" /* beta to vector */ + "ld1 {v0.4s}, [%[c_ptr0]]\n" /* load output r0 */ + "ld1 {v1.4s}, [%[c_ptr1]]\n" /* load output r1 */ + "fmla v8.4s, v0.4s, v7.4s\n" /* cr00 += beta * c_r00*/ + "fmla v9.4s, v1.4s, v7.4s\n" /* cr10 += beta * c_r10*/ + "ld1 {v2.4s}, [%[c_ptr2]]\n" + "ld1 {v3.4s}, [%[c_ptr3]]\n" + "fmla v10.4s, v2.4s, v7.4s\n" /* cr20 += beta * c_r20*/ + "fmla v11.4s, v3.4s, v7.4s\n" /* cr30 += beta * c_r30*/ + + "0: \n" /* check loop count */ + "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00,a10 to q0, q1*/ + "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ + "cbz %w[k], 2f\n" /* check loop count > 0 */ + /* main loop */ + /* unrool 0*/ + "1:\n" /* main loop */ + "fmla v8.4s, v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4 */ + "fmla v9.4s, v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4 */ + "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b3 to q6, q7 */ + "fmla v10.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4 */ + "fmla v11.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4 */ + + "ldp q2, q3, [%[a_ptr]], #32\n" /* load a20, a30 to q2, q3 */ + "fmla v8.4s, v5.4s, v1.s[0]\n" /* out0 = b1 * a10[0], b1 =q5 */ + "fmla v9.4s, v5.4s, v1.s[1]\n" /* out1 = b1 * a10[1], b1 =q5 */ + "fmla v10.4s, v5.4s, v1.s[2]\n" /* out2 = b1 * a10[2], b1 =q5 */ + "fmla v11.4s, v5.4s, v1.s[3]\n" /* out3 = b1 * a10[3], b1 =q5 */ + "ldp q4, q5, [%[b_ptr]], #32\n" /* load b0, b1 to q4, q5*/ + + "fmla v8.4s, v6.4s, v2.s[0]\n" /* out0 = b2 * a20[0], b2 =q6 */ + "fmla v9.4s, v6.4s, v2.s[1]\n" /* out1 = b2 * a20[1], b2 =q6 */ + "fmla v10.4s, v6.4s, v2.s[2]\n" /* out2 = b2 * a20[2], b2 =q6*/ + "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b2 =q6*/ + "ldp q0, q1, [%[a_ptr]], #32\n" /* load a00, a10 to q0, q1 */ + + "fmla v8.4s, v7.4s, v3.s[0]\n" /* out0 = b3 * a30[0], b3 =q7*/ + "fmla v9.4s, v7.4s, v3.s[1]\n" /* out1 = b3 * a30[1], b3 =q7*/ + "subs %w[k], %w[k], #1\n" /* loop count - 1*/ + "fmla v10.4s, v7.4s, v3.s[2]\n" /* out2 = b3 * a30[2], b3 =q7*/ + "fmla v11.4s, v7.4s, v3.s[3]\n" /* out3 = b3 * a30[3], b3 =q7*/ + + "bne 1b\n" + "2:\n" /* process tail*/ + "subs %w[tail], %w[tail], #1\n" /* tail--*/ + "beq 3f\n" /*jump to tail = 1*/ + /* final unrool 0*/ + /* unrool 0, tail > 1*/ + "fmla v8.4s, v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4 */ + "fmla v9.4s, v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4 */ + "subs %w[tail], %w[tail], #1\n" /* tail--*/ + "fmla v10.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4 */ + "fmla v11.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4 */ + + "beq 4f\n" /*jump to tail = 2*/ + /* unrool 1, tail > 2*/ + "ldp q6, q7, [%[b_ptr]], #32\n" /* load b2, b3 to q6, q7 */ + + "fmla v8.4s, v5.4s, v1.s[0]\n" /* out0 = b1 * a10[0], b1 =q5 */ + "fmla v9.4s, v5.4s, v1.s[1]\n" /* out1 = b1 * a10[1], b1 =q5*/ + "subs %w[tail], %w[tail], #1\n" /* tail--*/ + "fmla v10.4s, v5.4s, v1.s[2]\n" /* out2 = b1 * a10[2], b1 =q5 */ + "fmla v11.4s, v5.4s, v1.s[3]\n" /* out3 = b1 * a10[3], b1 =q5 */ + "ldp q2, q3, [%[a_ptr]], #32\n" /* load a20, a30 to q2, q3 */ + + "beq 5f\n" /*jump to tail = 3*/ + /* unrool 2, tail = 4*/ + "fmla v8.4s, v6.4s, v2.s[0]\n" /* out0 = b2 * a20[0], b1 =q6 */ + "fmla v9.4s, v6.4s, v2.s[1]\n" /* out1 = b2 * a20[1], b1 =q6 */ + "fmla v10.4s, v6.4s, v2.s[2]\n" /* out2 = b2 * a20[2], b1 =q6*/ + "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b1 =q6*/ + + /* unrool 3, tail = 4*/ + + "fmla v8.4s, v7.4s, v3.s[0]\n" /* out0 = b3 * a30[0], b3 =q7*/ + "fmla v9.4s, v7.4s, v3.s[1]\n" /* out1 = b3 * a30[1], 
b3 =q7*/ + "fmla v10.4s, v7.4s, v3.s[2]\n" /* out2 = b3 * a30[2], b3 =q7*/ + "fmla v11.4s, v7.4s, v3.s[3]\n" /* out3 = b3 * a30[3], b3 =q7*/ + + "b 11f\n" + /* tails==1 final tail*/ + "3: \n" /* tail=1*/ + "fmla v8.4s, v4.4s, v0.s[0]\n" /* out0 = b0 * a00[0], b0 =q4 */ + "fmla v9.4s, v4.4s, v0.s[1]\n" /* out1 = b0 * a00[1], b0 =q4 */ + "fmla v10.4s, v4.4s, v0.s[2]\n" /* out2 = b0 * a00[2], b0 =q4 */ + "fmla v11.4s, v4.4s, v0.s[3]\n" /* out3 = b0 * a00[3], b0 =q4 */ + + "b 11f\n" + /* tails==2 final tail*/ + "4:\n" /* tail = 2*/ + + "fmla v8.4s, v5.4s, v1.s[0]\n" /* out0 = b1 * a10[0], b1 =q5 */ + "fmla v9.4s, v5.4s, v1.s[1]\n" /* out1 = b1 * a10[1], b1 =q5*/ + "fmla v10.4s, v5.4s, v1.s[2]\n" /* out2 = b1 * a10[2], b1 =q5 */ + "fmla v11.4s, v5.4s, v1.s[3]\n" /* out3 = b1 * a10[3], b1 =q5 */ + + "b 11f\n" + /* tails==3 final tail*/ + "5:\n" /* tail = 3*/ + "fmla v8.4s, v6.4s, v2.s[0]\n" /* out0 = b2 * a20[0], b1 =q6 */ + "fmla v9.4s, v6.4s, v2.s[1]\n" /* out1 = b2 * a20[1], b1 =q6 */ + "fmla v10.4s, v6.4s, v2.s[2]\n" /* out2 = b2 * a20[2], b1 =q6*/ + "fmla v11.4s, v6.4s, v2.s[3]\n" /* out3 = b2 * a20[3], b1 =q6*/ + + "11: \n" /* check if relu */ + "cbz %w[relu], 12f\n" /* skip relu */ + "movi v2.4s, #0\n" /* for relu*/ + "fmax v8.4s, v8.4s, v2.4s\n" /* relu*/ + "fmax v9.4s, v9.4s, v2.4s\n" /* relu*/ + "fmax v10.4s, v10.4s, v2.4s\n" /* relu*/ + "fmax v11.4s, v11.4s, v2.4s\n" /* relu*/ + "12: \n" + "st1 {v8.4s}, [%[c_ptr0]], #16\n" /* store r0 */ + "st1 {v9.4s}, [%[c_ptr1]], #16\n" /* store r1 */ + "st1 {v10.4s}, [%[c_ptr2]], #16\n" /* store r2 */ + "st1 {v11.4s}, [%[c_ptr3]], #16\n" /* store r3 */ + + : [a_ptr] "+r"(a_ptr), + [b_ptr] "+r"(b_ptr), + [k] "+r"(k), + [tail] "+r"(tail), + [c_ptr0] "+r"(c_ptr0), + [c_ptr1] "+r"(c_ptr1), + [c_ptr2] "+r"(c_ptr2), + [c_ptr3] "+r"(c_ptr3) + : [bias_ptr] "r"(bias_local), + [relu] "r"(has_relu), + [has_beta] "r"(has_beta), + [beta] "r"(beta) + : "cc","memory", + "v0","v1","v2","v3","v4","v5","v6","v7", + "v8","v9","v10","v11"); + // clang-format on + if (flag_p_remain && (xb == bblocks - 1)) { + for (int i = 0; i < remain; ++i) { + *pout0++ = cout0[i]; + *pout1++ = cout1[i]; + *pout2++ = cout2[i]; + *pout3++ = cout3[i]; + } + } + } + } + } +} #else // __aarch64__ /** * \brief gemm with ablock = 6, bblock = 8, output 6x8 diff --git a/lite/backends/arm/math/packed_sgemm_c4.cc b/lite/backends/arm/math/packed_sgemm_c4.cc new file mode 100644 index 0000000000..8087e0337b --- /dev/null +++ b/lite/backends/arm/math/packed_sgemm_c4.cc @@ -0,0 +1,1171 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
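+// C4-packed SGEMM kernels. A is pre-packed into 4-row (MBLOCK_C4) blocks,
+// loadb_c4 repacks B into NBLOCK_C4-wide column panels, and the two drivers
+// below accumulate with NEON fmla/vmla: sgemm_prepack_c4_common tiles N
+// against the L2 cache size, while sgemm_prepack_c4_small is a simpler path
+// without the N-chunking. Both support an optional bias and a fused ReLU.
+// A scalar sketch of the math being computed (the data itself is stored in
+// the interleaved c4 layout), assuming logical shapes A[M][K], B[K][N]:
+//   C[m][n] = bias[m] + sum_k A[m][k] * B[k][n];  // then max(0, .) if has_relu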
+ +#include "lite/backends/arm/math/packed_sgemm_c4.h" +#include + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +void loadb_c4(float* out, + const float* in, + const int xstart, + const int xend, + const int k_round, + const int n) { + const int xlen = (xend - xstart + NBLOCK_C4 - 1) / NBLOCK_C4 * NBLOCK_C4; + int xloop = xlen / NBLOCK_C4; + const int flag_remain = n < xstart + xlen; + int remain = 0; + int remain4 = 0; + int remain1 = 0; + if (flag_remain) { + remain = (n - xstart) - (xloop - 1) * NBLOCK_C4; + remain4 = remain >> 2; + remain1 = remain & 3; + xloop -= 1; + } + const int ldo = NBLOCK_C4 * k_round; + const int kloop = k_round >> 2; + in += xstart * 4; + if (xloop > 0) { +#pragma omp parallel for + for (int i = 0; i < kloop; ++i) { + float* out_ptr = out + 4 * NBLOCK_C4 * i; + const float* in_ptr = in + i * 4 * n; + for (int j = 0; j < xloop; ++j) { + float* out_p = out_ptr + j * ldo; +#ifdef __aarch64__ + asm volatile( + "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" + "st1 {v0.4s, v1.4s}, [%[out]], #32 \n" + "ld1 {v4.4s, v5.4s}, [%[in]], #32 \n" + "st1 {v2.4s, v3.4s}, [%[out]], #32 \n" + "ld1 {v6.4s, v7.4s}, [%[in]], #32 \n" + "st1 {v4.4s, v5.4s}, [%[out]], #32 \n" + "st1 {v6.4s, v7.4s}, [%[out]], #32 \n" + : [in] "+r"(in_ptr), [out] "+r"(out_p) + : + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +#else + asm volatile( + "vld1.32 {d0-d3}, [%[in]]! \n" + "vld1.32 {d4-d7}, [%[in]]! \n" + "vst1.32 {d0-d3}, [%[out]]! \n" + "vld1.32 {d8-d11}, [%[in]]! \n" + "vst1.32 {d4-d7}, [%[out]]! \n" + "vld1.32 {d12-d15}, [%[in]]! \n" + "vst1.32 {d8-d11}, [%[out]]! \n" + "vst1.32 {d12-d15}, [%[out]]! \n" + : [in] "+r"(in_ptr), [out] "+r"(out_p) + : + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"); +#endif // __aarch674__ + } + } + } + float* out_remain4 = out + xloop * k_round * NBLOCK_C4; + const float* in_remain4 = in + xloop * NBLOCK_C4 * 4; + if (remain4) { +#pragma omp parallel for + for (int i = 0; i < kloop; ++i) { + float* out_ptr = out_remain4 + 4 * 4 * i; + const float* in_ptr = in_remain4 + i * 4 * n; +#ifdef __aarch64__ + asm volatile( + "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n" + "st1 {v0.4s, v1.4s}, [%[out]], #32 \n" + "st1 {v2.4s, v3.4s}, [%[out]], #32 \n" + : [in] "+r"(in_ptr), [out] "+r"(out_ptr) + : + : "v0", "v1", "v2", "v3"); +#else + asm volatile( + "vld1.32 {d0-d3}, [%[in]]! \n" + "vld1.32 {d4-d7}, [%[in]]! \n" + "vst1.32 {d0-d3}, [%[out]]! \n" + "vst1.32 {d4-d7}, [%[out]]! \n" + : [in] "+r"(in_ptr), [out] "+r"(out_ptr) + : + : "q0", "q1", "q2", "q3"); +#endif // __aarch64__ + } + } + float* out_remain1 = out_remain4 + remain4 * k_round * 4; + const float* in_remain1 = in_remain4 + remain4 * 4 * 4; + if (remain1) { +#pragma omp parallel for + for (int i = 0; i < kloop; ++i) { + float* out_ptr = out_remain1 + 4 * remain1 * i; + const float* in_ptr = in_remain1 + i * 4 * n; + for (int j = 0; j < remain1; ++j) { + float32x4_t vin = vld1q_f32(in_ptr); + in_ptr += 4; + vst1q_f32(out_ptr, vin); + out_ptr += 4; + } + } + } +} + +void sgemm_prepack_c4_common(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx) { + const int m_round = (M + 3) / 4 * 4; + const int k_round = (K + 3) / 4 * 4; + size_t l2_cache = ctx->llc_size() > 0 ? 
ctx->llc_size() : 512 * 1024; + int threads = ctx->threads(); + auto workspace = ctx->workspace_data(); + // l2 = ablock * K * threads + K * bchunk_w + threads * ablock * bchunk_w; + int bchunk_w = (l2_cache - threads * k_round * sizeof(float)) / + ((k_round + threads * MBLOCK_C4) * sizeof(float)); + bchunk_w = bchunk_w > N ? N : bchunk_w; + bchunk_w = bchunk_w / NBLOCK_C4 * NBLOCK_C4; + bchunk_w = bchunk_w > NBLOCK_C4 ? bchunk_w : NBLOCK_C4; + int bchunk_loop = (N + bchunk_w - 1) / bchunk_w; + + const int h_loop = m_round >> 2; // MBLOCK_C4 == 4; + const int kcnt = (k_round + KBLOCK_C4 - 1) / KBLOCK_C4; + const int ldc = N * 4; + const int lda = k_round * 4; + float bias_buf[m_round]; // NOLINT + if (has_bias) { + memcpy(bias_buf, bias, M * sizeof(float)); + memset(bias_buf + M, 0, (m_round - M) * sizeof(float)); + } else { + memset(bias_buf, 0, m_round * sizeof(float)); + } + // bchunk_loop + float* c = C; + for (int n = 0; n < bchunk_loop; ++n) { + int x_start = n * bchunk_w; + int x_end = x_start + bchunk_w; + int w_loop = bchunk_w / NBLOCK_C4; + int flag_remain = 0; + int w_loop4 = 0; + int remain = 0; + if (x_end > N) { + w_loop = (N - x_start) / NBLOCK_C4; + int w_loop_rem = (N - x_start) - w_loop * NBLOCK_C4; + w_loop4 = w_loop_rem >> 2; + remain = w_loop_rem & 3; + x_end = N; + flag_remain = 1; + } + float* bchunk = workspace; + loadb_c4(bchunk, B, x_start, x_end, k_round, N); + float* cchunk = c + n * bchunk_w * 4; + int has_remain = (n == bchunk_loop - 1) && flag_remain; +#pragma omp parallel for num_threads(threads) + for (int h = 0; h < h_loop; ++h) { + float* bias_h = bias_buf + h * 4; +#ifdef __aarch64__ + float32x4_t vzero = vdupq_n_f32(0.f); + float32x4_t vbias = vld1q_f32(bias_h); +#endif + const float* ablock = A_packed + h * lda; + const float* bblock = bchunk; + float* cblock = cchunk + h * ldc; + for (int w = 0; w < w_loop; ++w) { + int cnt = kcnt; + const float* ablock_ptr = ablock; +// clang-format off +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[a]] \n" + "prfm pldl1keep, [%[b]] \n" + "prfm pldl1keep, [%[b], #64] \n" + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vbias].16b \n" /* mov bias to c1*/ + "mov v11.16b, %[vbias].16b \n" /* mov bias to c2*/ + "mov v12.16b, %[vbias].16b \n" /* mov bias to c3*/ + /* load a0a1 to v1-v2 */ + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "mov v13.16b, %[vbias].16b \n" /* mov bias to c4*/ + "mov v14.16b, %[vbias].16b \n" /* mov bias to c5*/ + "mov v15.16b, %[vbias].16b \n" /* mov bias to c6*/ + "mov v16.16b, %[vbias].16b \n" /* mov bias to c7*/ + "1:\n" + /* load b0b1b2b3 to v5-v8 */ + "ld1 {v5.4s, v6.4s}, [%[b]], #32 \n" + "ld1 {v7.4s, v8.4s}, [%[b]], #32 \n" + "prfm pldl1keep, [%[b]] \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v1.4s, v6.s[0] \n" + "fmla v11.4s, v1.4s, v7.s[0] \n" + "fmla v12.4s, v1.4s, v8.s[0] \n" + /* load b4b5b6b7 to v25-v28 */ + "ld1 {v25.4s, v26.4s}, [%[b]], #32 \n" + "ld1 {v27.4s, v28.4s}, [%[b]], #32 \n" + "prfm pldl1keep, [%[a], #32] \n" + "fmla v9.4s, v2.4s, v5.s[1] \n" + "fmla v10.4s, v2.4s, v6.s[1] \n" + "fmla v11.4s, v2.4s, v7.s[1] \n" + "fmla v12.4s, v2.4s, v8.s[1] \n" + "prfm pldl1keep, [%[b], #64] \n" + "fmla v13.4s, v1.4s, v25.s[0] \n" + "fmla v14.4s, v1.4s, v26.s[0] \n" + "fmla v15.4s, v1.4s, v27.s[0] \n" + "fmla v16.4s, v1.4s, v28.s[0] \n" + /* load a2a3 to v3-v4 */ + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "prfm pldl1keep, [%[b], #128] \n" + "fmla v13.4s, v2.4s, v25.s[1] \n" + "fmla v14.4s, v2.4s, v26.s[1] \n" + "fmla v15.4s, v2.4s, v27.s[1] \n" + "fmla 
v16.4s, v2.4s, v28.s[1] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v3.4s, v6.s[2] \n" + "fmla v11.4s, v3.4s, v7.s[2] \n" + "fmla v12.4s, v3.4s, v8.s[2] \n" + "fmla v13.4s, v3.4s, v25.s[2] \n" + "fmla v14.4s, v3.4s, v26.s[2] \n" + "fmla v15.4s, v3.4s, v27.s[2] \n" + "fmla v16.4s, v3.4s, v28.s[2] \n" + /* load a0a1 to v1-v2 */ + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "fmla v9.4s, v4.4s, v5.s[3] \n" + "fmla v10.4s, v4.4s, v6.s[3] \n" + "fmla v11.4s, v4.4s, v7.s[3] \n" + "fmla v12.4s, v4.4s, v8.s[3] \n" + + "fmla v13.4s, v4.4s, v25.s[3] \n" + "fmla v14.4s, v4.4s, v26.s[3] \n" + "fmla v15.4s, v4.4s, v27.s[3] \n" + "fmla v16.4s, v4.4s, v28.s[3] \n" + "bne 1b\n" + "cbz %w[relu], 2f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + "fmax v12.4s, v12.4s, %[vzero].4s \n" + "fmax v13.4s, v13.4s, %[vzero].4s \n" + "fmax v14.4s, v14.4s, %[vzero].4s \n" + "fmax v15.4s, v15.4s, %[vzero].4s \n" + "fmax v16.4s, v16.4s, %[vzero].4s \n" + "2:\n" + "st1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[c]], #64 \n" + "st1 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[c]], #64 \n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), [relu] "r"(has_relu), + [vbias] "w"(vbias), [vzero] "w" (vzero) + : "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", + "v25", "v26", "v27", "v28", "cc", "memory"); +#else + asm volatile( + "vld1.32 {d6-d7}, [%[bias]] \n" + "pld [%[a]] \n" + "pld [%[b]] \n" + "pld [%[b], #64] \n" + "vmov.32 q8, q3 \n" /* mov bias to c0*/ + "vmov.32 q9, q3 \n" /* mov bias to c1*/ + "vmov.32 q10, q3 \n" /* mov bias to c2*/ + "vmov.32 q11, q3 \n" /* mov bias to c3*/ + "vld1.32 {d0-d3}, [%[a]]! \n" + "vmov.32 q12, q3 \n" /* mov bias to c4*/ + "vmov.32 q13, q3 \n" /* mov bias to c5*/ + "vmov.32 q14, q3 \n" /* mov bias to c6*/ + "vmov.32 q15, q3 \n" /* mov bias to c7*/ + "1:\n" + /* c0c1c2c3 */ + "vld1.32 {d8-d11}, [%[b]]! \n" + "vld1.32 {d12-d15}, [%[b]]! \n" + "pld [%[b]] \n" + "vmla.f32 q8, q0, d8[0] \n" + "vmla.f32 q9, q0, d10[0] \n" + "vmla.f32 q10, q0, d12[0] \n" + "vmla.f32 q11, q0, d14[0] \n" + "vld1.32 {d4-d7}, [%[a]]! \n" + "vmla.f32 q8, q1, d8[1] \n" + "vmla.f32 q9, q1, d10[1] \n" + "vmla.f32 q10, q1, d12[1] \n" + "vmla.f32 q11, q1, d14[1] \n" + "pld [%[b], #64] \n" + "vmla.f32 q8, q2, d9[0] \n" + "vmla.f32 q9, q2, d11[0] \n" + "vmla.f32 q10, q2, d13[0] \n" + "vmla.f32 q11, q2, d15[0] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q8, q3, d9[1] \n" + "vmla.f32 q9, q3, d11[1] \n" + "vld1.f32 {d8-d11}, [%[b]]! \n" + "vmla.f32 q10, q3, d13[1] \n" + "vmla.f32 q11, q3, d15[1] \n" + "vld1.32 {d12-d15}, [%[b]]! \n" + /* c4c5c6c7 */ + "vmla.f32 q12, q0, d8[0] \n" + "vmla.f32 q13, q0, d10[0] \n" + "vmla.f32 q14, q0, d12[0] \n" + "vmla.f32 q15, q0, d14[0] \n" + "pld [%[a], #32] \n" + "vmla.f32 q12, q1, d8[1] \n" + "vmla.f32 q13, q1, d10[1] \n" + "vmla.f32 q14, q1, d12[1] \n" + "vmla.f32 q15, q1, d14[1] \n" + "vld1.32 {d0-d3}, [%[a]]! 
\n" + "vmla.f32 q12, q2, d9[0] \n" + "vmla.f32 q13, q2, d11[0] \n" + "vmla.f32 q14, q2, d13[0] \n" + "vmla.f32 q15, q2, d15[0] \n" + "pld [%[b], #64] \n" + "vmla.f32 q12, q3, d9[1] \n" + "vmla.f32 q13, q3, d11[1] \n" + "vmla.f32 q14, q3, d13[1] \n" + "vmla.f32 q15, q3, d15[1] \n" + "bne 1b\n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmov.u32 q0, #0 \n" + "vmax.f32 q8, q8, q0 \n" + "vmax.f32 q9, q9, q0 \n" + "vmax.f32 q10, q10, q0 \n" + "vmax.f32 q11, q11, q0 \n" + "vmax.f32 q12, q12, q0 \n" + "vmax.f32 q13, q13, q0 \n" + "vmax.f32 q14, q14, q0 \n" + "vmax.f32 q15, q15, q0 \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]! \n" + "vst1.32 {d20-d23}, [%[c]]! \n" + "vst1.32 {d24-d27}, [%[c]]! \n" + "vst1.32 {d28-d31}, [%[c]]! \n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), + [relu] "r"(has_relu) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "q12", "q13", "q14", "q15", "cc", "memory"); +#endif + // clang-format on + } + if (has_remain) { + if (w_loop4 > 0) { + int cnt = kcnt; + const float* ablock_ptr = ablock; +// clang-format off +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[a]] \n" + "prfm pldl1keep, [%[b]] \n" + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vbias].16b \n" /* mov bias to c1*/ + "mov v11.16b, %[vbias].16b \n" /* mov bias to c2*/ + "mov v12.16b, %[vbias].16b \n" /* mov bias to c3*/ + /* load a0a1 to v1-v2 */ + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "1:\n" + /* load b0b1b2b3 to v5-v8 */ + "ld1 {v5.4s, v6.4s}, [%[b]], #32 \n" + "ld1 {v7.4s, v8.4s}, [%[b]], #32 \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v1.4s, v6.s[0] \n" + "fmla v11.4s, v1.4s, v7.s[0] \n" + "fmla v12.4s, v1.4s, v8.s[0] \n" + /* load a2a3 to v3-v4 */ + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "prfm pldl1keep, [%[a]] \n" + "fmla v9.4s, v2.4s, v5.s[1] \n" + "fmla v10.4s, v2.4s, v6.s[1] \n" + "fmla v11.4s, v2.4s, v7.s[1] \n" + "fmla v12.4s, v2.4s, v8.s[1] \n" + "prfm pldl1keep, [%[b]] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v3.4s, v6.s[2] \n" + "fmla v11.4s, v3.4s, v7.s[2] \n" + "fmla v12.4s, v3.4s, v8.s[2] \n" + /* load a0a1 to v1-v2 */ + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "fmla v9.4s, v4.4s, v5.s[3] \n" + "fmla v10.4s, v4.4s, v6.s[3] \n" + "fmla v11.4s, v4.4s, v7.s[3] \n" + "fmla v12.4s, v4.4s, v8.s[3] \n" + "bne 1b\n" + "cbz %w[relu], 2f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + "fmax v12.4s, v12.4s, %[vzero].4s \n" + "2:\n" + "st1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[c]], #64 \n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), + [relu] "r"(has_relu), + [vbias] "w"(vbias), + [vzero] "w" (vzero) + : "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v10", "v11", "v12", "cc", "memory"); +#else + asm volatile( + "pld [%[a]] \n" + "pld [%[b]] \n" + "vld1.32 {d6-d7}, [%[bias]] \n" + "vld1.32 {d0-d3}, [%[a]]! \n" /* load a0 a1 */ + "vmov.32 q8, q3 \n" /* mov bias to c0 */ + "vmov.32 q9, q3 \n" /* mov bias to c1 */ + "vmov.32 q10, q3 \n" /* mov bias to c2 */ + "vmov.32 q11, q3 \n" /* mov bias to c3 */ + "1:\n" + /* c0c1c2c3 */ + "vld1.32 {d8-d11}, [%[b]]! \n" + "vld1.32 {d12-d15}, [%[b]]! \n" + "pld [%[b]] \n" + "vmla.f32 q8, q0, d8[0] \n" + "vmla.f32 q9, q0, d10[0] \n" + "vmla.f32 q10, q0, d12[0] \n" + "vmla.f32 q11, q0, d14[0] \n" + "vld1.32 {d4-d7}, [%[a]]! 
\n" + "pld [%[a]] \n" + "vmla.f32 q8, q1, d8[1] \n" + "vmla.f32 q9, q1, d10[1] \n" + "vmla.f32 q10, q1, d12[1] \n" + "vmla.f32 q11, q1, d14[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q8, q2, d9[0] \n" + "vmla.f32 q9, q2, d11[0] \n" + "vmla.f32 q10, q2, d13[0] \n" + "vmla.f32 q11, q2, d15[0] \n" + "vld1.32 {d0-d3}, [%[a]]! \n" + "vmla.f32 q8, q3, d9[1] \n" + "vmla.f32 q9, q3, d11[1] \n" + "vmla.f32 q10, q3, d13[1] \n" + "vmla.f32 q11, q3, d15[1] \n" + "bne 1b\n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmov.u32 q0, #0 \n" + "vmax.f32 q8, q8, q0 \n" + "vmax.f32 q9, q9, q0 \n" + "vmax.f32 q10, q10, q0 \n" + "vmax.f32 q11, q11, q0 \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]! \n" + "vst1.32 {d20-d23}, [%[c]]! \n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), [relu] "r"(has_relu) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q10", "q11", "cc", "memory"); +#endif + // clang-format on + } + if (remain > 0) { + int cnt = kcnt; + const float* ablock_ptr = ablock; +// clang-format off +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[a]] \n" + "prfm pldl1keep, [%[b]] \n" + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "cmp %w[remain], #3 \n" + "beq 1f \n" + "cmp %w[remain], #2 \n" + "beq 2f \n" + /* remain 1 */ + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vzero].16b \n" /* mov zero to c1*/ + "3: \n" + "ld1 {v5.4s}, [%[b]], #16 \n" + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v2.4s, v5.s[1] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v4.4s, v5.s[3] \n" + "bne 3b \n" + "fadd v9.4s, v9.4s, v10.4s \n" + "cbz %w[relu], 6f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "6: \n" + "st1 {v9.4s}, [%[c]], #16 \n" + "b 9f \n" + /* remain 2 */ + "2: \n" + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vbias].16b \n" /* mov bias to c1*/ + "mov v11.16b, %[vzero].16b \n" /* mov zero to c2*/ + "mov v12.16b, %[vzero].16b \n" /* mov zero to c3*/ + "4: \n" + "ld1 {v5.4s, v6.4s}, [%[b]], #32 \n" + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v1.4s, v6.s[0] \n" + "fmla v11.4s, v2.4s, v5.s[1] \n" + "fmla v12.4s, v2.4s, v6.s[1] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v3.4s, v6.s[2] \n" + "fmla v11.4s, v4.4s, v5.s[3] \n" + "fmla v12.4s, v4.4s, v6.s[3] \n" + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "bne 4b \n" + "fadd v9.4s, v9.4s, v11.4s \n" + "fadd v10.4s, v10.4s, v12.4s \n" + "cbz %w[relu], 7f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "7: \n" + "st1 {v9.4s, v10.4s}, [%[c]], #32 \n" + "b 9f \n" + /* remain 3 */ + "1: \n" + "mov v9.16b, %[vbias].16b \n" /* mov bias to c0*/ + "mov v10.16b, %[vbias].16b \n" /* mov bias to c1*/ + "mov v11.16b, %[vbias].16b \n" /* mov bias to c2*/ + "5: \n" + "ld1 {v5.4s, v6.4s}, [%[b]], #32 \n" + "ld1 {v7.4s}, [%[b]], #16 \n" + "fmla v9.4s, v1.4s, v5.s[0] \n" + "fmla v10.4s, v1.4s, v6.s[0] \n" + "fmla v11.4s, v1.4s, v7.s[0] \n" + "ld1 {v3.4s, v4.4s}, [%[a]], #32 \n" + "fmla v9.4s, v2.4s, v5.s[1] \n" + "fmla v10.4s, v2.4s, v6.s[1] \n" + "fmla v11.4s, v2.4s, v7.s[1] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "fmla v9.4s, v3.4s, v5.s[2] \n" + "fmla v10.4s, v3.4s, v6.s[2] \n" + "fmla v11.4s, v3.4s, v7.s[2] \n" + "prfm pldl1keep, [%[a]] \n" + "fmla v9.4s, v4.4s, v5.s[3] \n" + "fmla v10.4s, v4.4s, v6.s[3] \n" + "fmla v11.4s, 
v4.4s, v7.s[3] \n" + "ld1 {v1.4s, v2.4s}, [%[a]], #32 \n" + "bne 5b \n" + "cbz %w[relu], 8f \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + "8: \n" + "st1 {v9.4s, v10.4s}, [%[c]], #32 \n" + "st1 {v11.4s}, [%[c]], #16 \n" + "9:\n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), [relu] "r"(has_relu), + [remain] "r"(remain), [vbias] "w"(vbias), + [vzero] "w" (vzero) + : "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v9", + "v10", "v11", "v12", "cc","memory"); +#else + asm volatile( + "pld [%[a]] \n" + "pld [%[b]] \n" + "vld1.32 {d0-d1}, [%[bias]] \n" + "vld1.32 {d2-d5}, [%[a]]! \n" + "vmov.u32 q15, #0 \n" + "cmp %[remain], #3 \n" + "beq 1f \n" + "cmp %[remain], #2 \n" + "beq 2f \n" + /* remain 1 */ + "vmov.32 q9, q0 \n" /* mov bias to c0*/ + "vmov.32 q10, q15 \n" /* mov zero to c1*/ + "3: \n" + "vld1.32 {d10-d11}, [%[b]]! \n" + "vld1.32 {d6-d9}, [%[a]]! \n" + "vmla.f32 q9, q1, d10[0] \n" + "vmla.f32 q10, q2, d10[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vld1.32 {d2-d5}, [%[a]]! \n" + "vmla.f32 q9, q3, d11[0] \n" + "vmla.f32 q10, q4, d11[1] \n" + "bne 3b \n" + "vadd.f32 q9, q9, q10 \n" + "cmp %[relu], #0 \n" + "beq 6f \n" + "vmax.f32 q9, q9, q15 \n" + "6: \n" + "vst1.32 {d18-d19}, [%[c]]! \n" + "b 9f \n" + /* remain 2 */ + "2: \n" + "vmov.u32 q9, q0 \n" /* mov bias to c0*/ + "vmov.u32 q10, q0 \n" /* mov bias to c1*/ + "vmov.u32 q11, q15 \n" /* mov zero to c2*/ + "vmov.u32 q12, q15 \n" /* mov zero to c3*/ + "4: \n" + "vld1.32 {d10-d13}, [%[b]]! \n" + "vld1.32 {d6-d9}, [%[a]]! \n" + "vmla.f32 q9, q1, d10[0] \n" + "vmla.f32 q10, q1, d12[0] \n" + "vmla.f32 q11, q2, d10[1] \n" + "vmla.f32 q12, q2, d12[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q9, q3, d11[0] \n" + "vmla.f32 q10, q3, d13[0] \n" + "vmla.f32 q11, q4, d11[1] \n" + "vmla.f32 q12, q4, d13[1] \n" + "vld1.32 {d2-d5}, [%[a]]! \n" + "bne 4b \n" + "vadd.f32 q9, q9, q11 \n" + "vadd.f32 q10, q10, q12 \n" + "cmp %[relu], #0 \n" + "beq 7f \n" + "vmax.f32 q9, q9, q15 \n" + "vmax.f32 q10, q10, q15 \n" + "7: \n" + "vst1.32 {d18-d21}, [%[c]]! \n" + "b 9f \n" + /* remain 3 */ + "1: \n" + "vmov.u32 q9, q0 \n" /* mov bias to c0*/ + "vmov.u32 q10, q0 \n" /* mov bias to c1*/ + "vmov.u32 q11, q0 \n" /* mov bias to c2*/ + "5: \n" + "vld1.32 {d10-d13}, [%[b]]! \n" + "vld1.32 {d14-d15}, [%[b]]! \n" + "vmla.f32 q9, q1, d10[0] \n" + "vmla.f32 q10, q1, d12[0] \n" + "vmla.f32 q11, q1, d14[0] \n" + "vld1.32 {d6-d9}, [%[a]]! \n" + "vmla.f32 q9, q2, d10[1] \n" + "vmla.f32 q10, q2, d12[1] \n" + "vmla.f32 q11, q2, d14[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q9, q3, d11[0] \n" + "vmla.f32 q10, q3, d13[0] \n" + "vmla.f32 q11, q3, d15[0] \n" + "pld [%[a]] \n" + "vmla.f32 q9, q4, d11[1] \n" + "vmla.f32 q10, q4, d13[1] \n" + "vmla.f32 q11, q4, d15[1] \n" + "vld1.32 {d2-d5}, [%[a]]! \n" + "bne 5b \n" + "cmp %[relu], #0 \n" + "beq 8f \n" + "vmax.f32 q9, q9, q15 \n" + "vmax.f32 q10, q10, q15 \n" + "vmax.f32 q11, q11, q15 \n" + "8: \n" + "vst1.32 {d18-d21}, [%[c]]! \n" + "vst1.32 {d22-d23}, [%[c]]! 
\n" + "9:\n" + : [a] "+r"(ablock_ptr), + [b] "+r"(bblock), + [c] "+r"(cblock), + [cnt] "+r"(cnt) + : [bias] "r"(bias_h), + [relu] "r"(has_relu), + [remain] "r"(remain) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q9", + "q10", "q11", "q12", "q15", "cc","memory"); +#endif + // clang-format on + } + } + } + } +} + +void sgemm_prepack_c4_small(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx) { + const int m_round = (M + 3) / 4 * 4; + const int k_round = (K + 3) / 4 * 4; + const int mloop = m_round >> 2; + const int lda = 4 * k_round; + const int ldb_byte = 4 * N * sizeof(float); + const int kcnt = k_round >> 2; + float bias_buf[m_round]; // NOLINT + if (has_bias) { + memcpy(bias_buf, bias, M * sizeof(float)); + memset(bias_buf + M, 0, (m_round - M) * sizeof(float)); + } else { + memset(bias_buf, 0, m_round * sizeof(float)); + } +#ifdef __aarch64__ + float32x4_t vzero = vdupq_n_f32(0.f); +#endif + const float* bias_ptr = bias_buf; + for (int m = 0; m < mloop; ++m) { +#ifdef __aarch64__ + float32x4_t vbias = vld1q_f32(bias_ptr); +#endif + const float* b = B; + int n = N; +#ifdef __aarch64__ + for (; n > 7; n -= 8) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + // clang-format off + asm volatile( + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + /* mov bias to c0-c7*/ + "mov v8.16b, %[vbias].16b \n" + "mov v9.16b, %[vbias].16b \n" + "mov v10.16b, %[vbias].16b \n" + "mov v11.16b, %[vbias].16b \n" + /* load b0, b1 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "mov v12.16b, %[vbias].16b \n" + "mov v13.16b, %[vbias].16b \n" + "mov v14.16b, %[vbias].16b \n" + "mov v15.16b, %[vbias].16b \n" + "1:\n" + /* load b2, b3 */ + "ld1 {v2.4s, v3.4s}, [%[b]], #32 \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "fmla v8.4s, v16.4s, v0.s[0] \n" + "fmla v9.4s, v16.4s, v1.s[0] \n" + "fmla v10.4s, v16.4s, v2.s[0] \n" + "fmla v11.4s, v16.4s, v3.s[0] \n" + "prfm pldl1keep, [%[b]] \n" + "fmla v8.4s, v17.4s, v0.s[1] \n" + "fmla v9.4s, v17.4s, v1.s[1] \n" + "fmla v10.4s, v17.4s, v2.s[1] \n" + "fmla v11.4s, v17.4s, v3.s[1] \n" + /* load b4, b5 */ + "ld1 {v4.4s, v5.4s}, [%[b]], #32 \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v18.4s, v1.s[2] \n" + "fmla v10.4s, v18.4s, v2.s[2] \n" + "fmla v11.4s, v18.4s, v3.s[2] \n" + /* load b6, b7 */ + "ld1 {v6.4s, v7.4s}, [%[b]], #32 \n" + "fmla v8.4s, v19.4s, v0.s[3] \n" + "fmla v9.4s, v19.4s, v1.s[3] \n" + "fmla v10.4s, v19.4s, v2.s[3] \n" + "fmla v11.4s, v19.4s, v3.s[3] \n" + "sub %[b], %[b], #128 \n" + "fmla v12.4s, v16.4s, v4.s[0] \n" + "fmla v13.4s, v16.4s, v5.s[0] \n" + "fmla v14.4s, v16.4s, v6.s[0] \n" + "fmla v15.4s, v16.4s, v7.s[0] \n" + "add %[b], %[b], %[ldb] \n" + "fmla v12.4s, v17.4s, v4.s[1] \n" + "fmla v13.4s, v17.4s, v5.s[1] \n" + "fmla v14.4s, v17.4s, v6.s[1] \n" + "fmla v15.4s, v17.4s, v7.s[1] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "fmla v12.4s, v18.4s, v4.s[2] \n" + "fmla v13.4s, v18.4s, v5.s[2] \n" + "fmla v14.4s, v18.4s, v6.s[2] \n" + "fmla v15.4s, v18.4s, v7.s[2] \n" + /* load b0, b1 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "fmla v12.4s, v19.4s, v4.s[3] \n" + "fmla v13.4s, v19.4s, v5.s[3] \n" + "fmla v14.4s, v19.4s, v6.s[3] \n" + "fmla v15.4s, v19.4s, v7.s[3] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "bne 1b \n" + "cbz %w[relu], 2f \n" + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + 
"fmax v11.4s, v11.4s, %[vzero].4s \n" + "fmax v12.4s, v12.4s, %[vzero].4s \n" + "fmax v13.4s, v13.4s, %[vzero].4s \n" + "fmax v14.4s, v14.4s, %[vzero].4s \n" + "fmax v15.4s, v15.4s, %[vzero].4s \n" + "2:\n" + "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[c]], #64 \n" + "st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [%[c]], #64 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [vbias] "w" (vbias), + [vzero] "w" (vzero) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", + "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", + "v19", "cc", "memory" + ); + b += 4 * 8; + } + for (; n > 3; n -= 4) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + /* mov bias to c0-c3*/ + "mov v8.16b, %[vbias].16b \n" + "mov v9.16b, %[vbias].16b \n" + "mov v10.16b, %[vbias].16b \n" + "mov v11.16b, %[vbias].16b \n" + "1:\n" + /* load b0-b3 */ + "ld1 {v0.4s, v1.4s}, [%[b]], #32 \n" + "ld1 {v2.4s, v3.4s}, [%[b]], #32 \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "fmla v8.4s, v16.4s, v0.s[0] \n" + "fmla v9.4s, v16.4s, v1.s[0] \n" + "fmla v10.4s, v16.4s, v2.s[0] \n" + "fmla v11.4s, v16.4s, v3.s[0] \n" + "sub %[b], %[b], #64 \n" + "fmla v8.4s, v17.4s, v0.s[1] \n" + "fmla v9.4s, v17.4s, v1.s[1] \n" + "fmla v10.4s, v17.4s, v2.s[1] \n" + "fmla v11.4s, v17.4s, v3.s[1] \n" + "add %[b], %[b], %[ldb] \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v18.4s, v1.s[2] \n" + "fmla v10.4s, v18.4s, v2.s[2] \n" + "fmla v11.4s, v18.4s, v3.s[2] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "fmla v8.4s, v19.4s, v0.s[3] \n" + "fmla v9.4s, v19.4s, v1.s[3] \n" + "fmla v10.4s, v19.4s, v2.s[3] \n" + "fmla v11.4s, v19.4s, v3.s[3] \n" + "subs %w[cnt], %w[cnt], #1 \n" + "bne 1b \n" + "cbz %w[relu], 2f \n" + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "fmax v9.4s, v9.4s, %[vzero].4s \n" + "fmax v10.4s, v10.4s, %[vzero].4s \n" + "fmax v11.4s, v11.4s, %[vzero].4s \n" + "2:\n" + "st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [%[c]], #64 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [vbias] "w" (vbias), + [vzero] "w" (vzero) + : "v0", "v1", "v2", "v3", "v8", "v9", + "v10", "v11", "v16", "v17", "v18", + "v19", "cc", "memory" + ); + b += 4 * 4; + } + for (; n > 0; n--) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + /* mov bias to c0 */ + "mov v8.16b, %[vbias].16b \n" + "mov v9.16b, %[vzero].16b \n" + "1:\n" + /* load b0 */ + "ld1 {v0.4s}, [%[b]], #16 \n" + /* load a2, a3 */ + "ld1 {v18.4s, v19.4s}, [%[a]], #32 \n" + "fmla v8.4s, v16.4s, v0.s[0] \n" + "fmla v9.4s, v17.4s, v0.s[1] \n" + "sub %[b], %[b], #16 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "add %[b], %[b], %[ldb] \n" + "fmla v8.4s, v18.4s, v0.s[2] \n" + "fmla v9.4s, v19.4s, v0.s[3] \n" + /* load a0, a1 */ + "ld1 {v16.4s, v17.4s}, [%[a]], #32 \n" + "bne 1b \n" + "fadd v8.4s, v8.4s, v9.4s \n" + "cbz %w[relu], 2f \n" + "fmax v8.4s, v8.4s, %[vzero].4s \n" + "2:\n" + "st1 {v8.4s}, [%[c]], #16 \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [vbias] "w" (vbias), + [vzero] "w" (vzero) + : "v0", "v8", "v9", "v16", "v17", + "v18", "v19", "cc", "memory" + ); + b += 4; + } +#else + for (; n > 7; n -= 
8) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + // clang-format off + asm volatile( + "vld1.32 {d6-d7}, [%[bias]] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + /* mov bias to c0-c7*/ + "vmov.u32 q8, q3 \n" + "vmov.u32 q9, q3 \n" + "vmov.u32 q10, q3 \n" + "vmov.u32 q11, q3 \n" + /* load b0, b1 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmov.u32 q12, q3 \n" + "vmov.u32 q13, q3 \n" + "vmov.u32 q14, q3 \n" + "vmov.u32 q15, q3 \n" + "1:\n" + /* load b2, b3 */ + "vld1.32 {d4-d7}, [%[b]]! \n" + /* load a2, a3 */ + "vld1.32 {d12-d15}, [%[a]]! \n" + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q4, d2[0] \n" + "vmla.f32 q10, q4, d4[0] \n" + "vmla.f32 q11, q4, d6[0] \n" + "pld [%[b]] \n" + "vmla.f32 q8, q5, d0[1] \n" + "vmla.f32 q9, q5, d2[1] \n" + "vmla.f32 q10, q5, d4[1] \n" + "vmla.f32 q11, q5, d6[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "vmla.f32 q8, q6, d1[0] \n" + "vmla.f32 q9, q6, d3[0] \n" + "vmla.f32 q10, q6, d5[0] \n" + "vmla.f32 q11, q6, d7[0] \n" + "pld [%[b], #64] \n" + "vmla.f32 q8, q7, d1[1] \n" + "vmla.f32 q9, q7, d3[1] \n" + /* load b4, b5 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmla.f32 q10, q7, d5[1] \n" + "vmla.f32 q11, q7, d7[1] \n" + /* load b6, b7 */ + "vld1.32 {d4-d7}, [%[b]]! \n" + "vmla.f32 q12, q4, d0[0] \n" + "vmla.f32 q13, q4, d2[0] \n" + "vmla.f32 q14, q4, d4[0] \n" + "vmla.f32 q15, q4, d6[0] \n" + "sub %[b], %[b], #128 \n" + "vmla.f32 q12, q5, d0[1] \n" + "vmla.f32 q13, q5, d2[1] \n" + "vmla.f32 q14, q5, d4[1] \n" + "vmla.f32 q15, q5, d6[1] \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q12, q6, d1[0] \n" + "vmla.f32 q13, q6, d3[0] \n" + "vmla.f32 q14, q6, d5[0] \n" + "vmla.f32 q15, q6, d7[0] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + "vmla.f32 q12, q7, d1[1] \n" + "vmla.f32 q13, q7, d3[1] \n" + /* load b0, b1 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vmla.f32 q14, q7, d5[1] \n" + "vmla.f32 q15, q7, d7[1] \n" + "bne 1b \n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmov.u32 q0, #0 \n" + "vmax.f32 q8, q8, q0 \n" + "vmax.f32 q9, q9, q0 \n" + "vmax.f32 q10, q10, q0 \n" + "vmax.f32 q11, q11, q0 \n" + "vmax.f32 q12, q12, q0 \n" + "vmax.f32 q13, q13, q0 \n" + "vmax.f32 q14, q14, q0 \n" + "vmax.f32 q15, q15, q0 \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]! \n" + "vst1.32 {d20-d23}, [%[c]]! \n" + "vst1.32 {d24-d27}, [%[c]]! \n" + "vst1.32 {d28-d31}, [%[c]]! \n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [bias] "r" (bias_ptr) + : "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15", "cc", "memory" + ); + b += 4 * 8; + } + for (; n > 3; n -= 4) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + "vld1.32 {d24-d25}, [%[bias]] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + /* mov bias to c0-c3*/ + "vmov.u32 q8, q12 \n" + "vmov.u32 q9, q12 \n" + "vmov.u32 q10, q12 \n" + "vmov.u32 q11, q12 \n" + "vmov.u32 q13, #0 \n" + "1:\n" + /* load b0-b3 */ + "vld1.32 {d0-d3}, [%[b]]! \n" + "vld1.32 {d4-d7}, [%[b]]! 
\n" + /* load a2, a3 */ + "vld1.32 {d12-d15}, [%[a]]!\n" + "vmla.f32 q8, q4, d0[0] \n" + "vmla.f32 q9, q4, d2[0] \n" + "vmla.f32 q10, q4, d4[0] \n" + "vmla.f32 q11, q4, d6[0] \n" + "sub %[b], %[b], #64 \n" + "vmla.f32 q8, q5, d0[1] \n" + "vmla.f32 q9, q5, d2[1] \n" + "vmla.f32 q10, q5, d4[1] \n" + "vmla.f32 q11, q5, d6[1] \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q8, q6, d1[0] \n" + "vmla.f32 q9, q6, d3[0] \n" + "vmla.f32 q10, q6, d5[0] \n" + "vmla.f32 q11, q6, d7[0] \n" + /* load a0, a1 */ + "vld1.32 {d8-d11}, [%[a]]! \n" + "vmla.f32 q8, q7, d1[1] \n" + "vmla.f32 q9, q7, d3[1] \n" + "vmla.f32 q10, q7, d5[1] \n" + "vmla.f32 q11, q7, d7[1] \n" + "subs %[cnt], %[cnt], #1 \n" + "bne 1b \n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmax.f32 q8, q8, q13 \n" + "vmax.f32 q9, q9, q13 \n" + "vmax.f32 q10, q10, q13 \n" + "vmax.f32 q11, q11, q13 \n" + "2:\n" + "vst1.32 {d16-d19}, [%[c]]!\n" + "vst1.32 {d20-d23}, [%[c]]!\n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [bias] "r" (bias_ptr) + : "q0", "q1", "q2", "q3", "q4", "q5", + "q6", "q7", "q8", "q9", "q10", "q11", + "q12", "q13", "cc", "memory" + ); + b += 4 * 4; + } + for (; n > 0; n--) { + int cnt = kcnt; + const float* a_ptr = A_packed; + const float* b_ptr = b; + asm volatile( + "vld1.32 {d14-d15}, [%[bias]] \n" + "vmov.u32 q8, #0 \n" + /* load a0, a1 */ + "vld1.32 {d2-d5}, [%[a]]! \n" + /* mov bias to c0 */ + "vmov.u32 q5, q7 \n" + "vmov.u32 q6, q8 \n" + "1:\n" + /* load b0 */ + "vld1.32 {d0-d1}, [%[b]]! \n" + /* load a2, a3 */ + "vld1.32 {d6-d9}, [%[a]]! \n" + "vmla.f32 q5, q1, d0[0] \n" + "vmla.f32 q6, q2, d0[1] \n" + "sub %[b], %[b], #16 \n" + "subs %[cnt], %[cnt], #1 \n" + "add %[b], %[b], %[ldb] \n" + "vmla.f32 q5, q3, d1[0] \n" + "vmla.f32 q6, q4, d1[1] \n" + /* load a0, a1 */ + "vld1.32 {d2-d5}, [%[a]]! \n" + "bne 1b \n" + "vadd.f32 q5, q5, q6 \n" + "cmp %[relu], #0 \n" + "beq 2f \n" + "vmax.f32 q5, q5, q8 \n" + "2:\n" + "vst1.32 {d10-d11}, [%[c]]!\n" + : [a] "+r" (a_ptr), + [b] "+r" (b_ptr), + [c] "+r" (C), + [cnt] "+r" (cnt) + : [relu] "r" (has_relu), + [ldb] "r" (ldb_byte), + [bias] "r" (bias_ptr) + : "q0", "q1", "q2", "q3", "q4", + "q5", "q6", "q7", "q8", "cc", "memory" + ); + // clang-format on + b += 4; + } +#endif + bias_ptr += 4; + A_packed += lda; + } +} + +void sgemm_prepack_c4(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx) { + if (N > 16) { + sgemm_prepack_c4_common( + M, N, K, A_packed, B, C, bias, has_bias, has_relu, ctx); + } else { + sgemm_prepack_c4_small( + M, N, K, A_packed, B, C, bias, has_bias, has_relu, ctx); + } +} + +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/packed_sgemm_c4.h b/lite/backends/arm/math/packed_sgemm_c4.h new file mode 100644 index 0000000000..21e5af6343 --- /dev/null +++ b/lite/backends/arm/math/packed_sgemm_c4.h @@ -0,0 +1,53 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/core/context.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace arm { +namespace math { + +constexpr int MBLOCK_C4 = 4; +constexpr int NBLOCK_C4 = 8; +constexpr int KBLOCK_C4 = 4; + +void sgemm_prepack_c4(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx); +void sgemm_prepack_c4_small(int M, + int N, + int K, + const float* A_packed, + const float* B, + float* C, + const float* bias, + bool has_bias, + bool has_relu, + ARMContext* ctx); +} // namespace math +} // namespace arm +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/pooling.cc b/lite/backends/arm/math/pooling.cc index a857e9830c..8524d7376f 100644 --- a/lite/backends/arm/math/pooling.cc +++ b/lite/backends/arm/math/pooling.cc @@ -46,7 +46,7 @@ void pooling_basic(const float* din, int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; int size_channel_in = win * hin; int size_channel_out = wout * hout; if (global_pooling) { @@ -125,18 +125,22 @@ void pooling_basic(const float* din, int bh = kernel_h; int bw = kernel_w; if (ew == win) { - bw = sw + kernel_w >= win + pad_w ? win + pad_w - : sw + kernel_w; + bw = (sw + kernel_w) >= (win + paddings[3]) + ? (win + paddings[3]) + : (sw + kernel_w); bw -= sw; - if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) { + if ((sw - pad_w) < 0 && + (sw + kernel_w) > (win + paddings[3])) { bw += pad_w; } } if (eh == hin) { - bh = sh + kernel_h >= hin + pad_h ? hin + pad_h - : sh + kernel_h; + bh = (sh + kernel_h) >= (hin + paddings[1]) + ? 
(hin + paddings[1]) + : (sh + kernel_h); bh -= sh; - if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) { + if ((sh - pad_h) < 0 && + (sh + kernel_h) > (hin + paddings[1])) { bh += pad_h; } } diff --git a/lite/backends/arm/math/sgemv.cc b/lite/backends/arm/math/sgemv.cc index 506451932d..1830423136 100644 --- a/lite/backends/arm/math/sgemv.cc +++ b/lite/backends/arm/math/sgemv.cc @@ -14,6 +14,7 @@ #include "lite/backends/arm/math/sgemv.h" #include +#include #include "lite/utils/cp_logging.h" namespace paddle { @@ -50,6 +51,495 @@ void sgemv_bias_relu(const bool transA, const float *x, float *y, const float *bias); +#ifdef __aarch64__ +void sgemv_trans(const int M, + const int N, + const float *A, + const float *x, + float *y, + bool flag_bias, + const float *bias, + bool flag_relu, + const ARMContext *ctx) { + int m_cnt16 = M >> 4; + int m_cnt8 = (M & 15) >> 3; + int m_cnt4 = (M & 15 & 7) >> 2; + int m_remain = M & 15 & 7 & 3; + int ths = ctx->threads(); + int valid_ths = std::min((N + 3) / 4, ths); + int valid_block = std::max(4, (N / valid_ths + 3) / 4 * 4); + valid_ths = (N + valid_block - 1) / valid_block; + int block_cnt = valid_block / 4; + float zero_buf[M]; // NOLINT + float y_buf[valid_ths * M]; // NOLINT + memset(zero_buf, 0, M * sizeof(float)); + if (flag_bias) { + memcpy(y_buf, bias, M * sizeof(float)); + memset(y_buf + M, 0, (valid_ths - 1) * M * sizeof(float)); + } else { + memset(y_buf, 0, valid_ths * M * sizeof(float)); + } +#pragma omp parallel for + for (int t = 0; t < valid_ths; ++t) { + float *block_y = y_buf + t * M; + const float *block_x = x + t * valid_block; + const float *block_A = A + t * valid_block * M; + for (int i = 0; i < block_cnt; ++i) { + float *y_ptr = block_y; + const float *x_ptr = block_x + i * 4; + const float *in0_ptr = block_A + i * 4 * M; + const float *in1_ptr = in0_ptr + M; + const float *in2_ptr = in1_ptr + M; + const float *in3_ptr = in2_ptr + M; + int offset = t * valid_block + (i + 1) * 4 - N; + if (offset > 0) { + if (offset > 3) { + in0_ptr = zero_buf; + in1_ptr = zero_buf; + in2_ptr = zero_buf; + in3_ptr = zero_buf; + } else { + switch (offset) { + case 3: + in1_ptr = zero_buf; + case 2: + in2_ptr = zero_buf; + case 1: + in3_ptr = zero_buf; + default: + break; + } + } + } + // clang-format off + if (m_cnt16 > 0) { + int cnt16 = m_cnt16; + asm volatile( + "ld1 {v4.4s}, [%[x]] \n" /* load x to v4 */ + "ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[in0]], #64 \n" /* load in0 to v5, v6, v7, v8 */ + "ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[in1]], #64 \n" /* load in1 to v9, v10, v11, v12 */ + "ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[in2]], #64 \n" /* load in2 to v13, v14, v15, v16 */ + "ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [%[in3]], #64 \n" /* load in3 to v17, v18, v19, v20 */ + "1:\n" + "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[y]] \n" /*load y to v0, v1, v2, v3 */ + "fmla v0.4s, v5.4s, v4.s[0] \n" /* v0 += v5 * v4[0] */ + "fmla v1.4s, v6.4s, v4.s[0] \n" /* v1 += v6 * v4[0] */ + "fmla v2.4s, v7.4s, v4.s[0] \n" /* v2 += v7 * v4[0] */ + "fmla v3.4s, v8.4s, v4.s[0] \n" /* v3 += v8 * v4[0] */ + "ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [%[in0]], #64 \n" /* load in0 to v5, v6, v7, v8 */ + "fmla v0.4s, v9.4s, v4.s[1] \n" /* v0 += v9 * v4[1] */ + "fmla v1.4s, v10.4s, v4.s[1] \n" /* v1 += v10 * v4[1] */ + "fmla v2.4s, v11.4s, v4.s[1] \n" /* v2 += v11 * v4[1] */ + "fmla v3.4s, v12.4s, v4.s[1] \n" /* v3 += v12 * v4[1] */ + "ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [%[in1]], #64 \n" /* load in1 to v9, v10, v11, v12 */ + "fmla v0.4s, v13.4s, v4.s[2] \n" /* v0 += v13 * v4[2] */ 
+ "fmla v1.4s, v14.4s, v4.s[2] \n" /* v1 += v14 * v4[2] */ + "fmla v2.4s, v15.4s, v4.s[2] \n" /* v2 += v15 * v4[2] */ + "fmla v3.4s, v16.4s, v4.s[2] \n" /* v3 += v16 * v4[2] */ + "ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [%[in2]], #64 \n" /* load in2 to v13, v14, v15, v16 */ + "fmla v0.4s, v17.4s, v4.s[3] \n" /* v0 += v17 * v4[3] */ + "fmla v1.4s, v18.4s, v4.s[3] \n" /* v1 += v18 * v4[3] */ + "fmla v2.4s, v19.4s, v4.s[3] \n" /* v2 += v19 * v4[3] */ + "fmla v3.4s, v20.4s, v4.s[3] \n" /* v3 += v20 * v4[3] */ + "ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [%[in3]], #64 \n" /* load in3 to v17, v18, v19, v20 */ + "subs %w[cnt], %w[cnt], #1 \n" /* sub cnt */ + "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%[y]], #64 \n" /* store v0, v1, v2, v3 to y */ + "bne 1b \n" /* branch to label 1 */ + "sub %[in0], %[in0], #64 \n" /* restore in0 address */ + "sub %[in1], %[in1], #64 \n" /* restore in1 address */ + "sub %[in2], %[in2], #64 \n" /* restore in2 address */ + "sub %[in3], %[in3], #64 \n" /* restore in3 address */ + : [cnt] "+r"(cnt16), + [in0] "+r"(in0_ptr), + [in1] "+r"(in1_ptr), + [in2] "+r"(in2_ptr), + [in3] "+r"(in3_ptr), + [y] "+r"(y_ptr) + : [x] "r"(x_ptr) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", + "v17", "v18", "v19", "v20", "cc", "memory" + ); + } + if (m_cnt8 > 0) { + int cnt8 = m_cnt8; + asm volatile( + "ld1 {v2.4s}, [%[x]] \n" /* load x to v2 */ + "ld1 {v3.4s, v4.4s}, [%[in0]], #32 \n" /* load in0 to v3, v4 */ + "ld1 {v5.4s, v6.4s}, [%[in1]], #32 \n" /* load in1 to v5, v6 */ + "ld1 {v7.4s, v8.4s}, [%[in2]], #32 \n" /* load in2 to v7, v8 */ + "ld1 {v9.4s, v10.4s}, [%[in3]], #32 \n" /* load in3 to v9, v10*/ + "1:\n" + "ld1 {v0.4s, v1.4s}, [%[y]] \n" /* load y to v0, v1 */ + "fmla v0.4s, v3.4s, v2.s[0] \n" /* v0 += v3 * v2[0] */ + "fmla v1.4s, v4.4s, v2.s[0] \n" /* v1 += v4 * v2[0] */ + "prfm pldl1keep, [%[in0]] \n" /* preload in0 */ + "ld1 {v3.4s, v4.4s}, [%[in0]], #32 \n" /* load in0 to v3, v4 */ + "fmla v0.4s, v5.4s, v2.s[1] \n" /* v0 += v5 * v2[1] */ + "fmla v1.4s, v6.4s, v2.s[1] \n" /* v1 += v6 * v2[1] */ + "prfm pldl1keep, [%[in1]] \n" /* preload in1 */ + "ld1 {v5.4s, v6.4s}, [%[in1]], #32 \n" /* load in0 to v5, v6 */ + "fmla v0.4s, v7.4s, v2.s[2] \n" /* v0 += v7 * v2[2] */ + "fmla v1.4s, v8.4s, v2.s[2] \n" /* v1 += v8 * v2[2] */ + "prfm pldl1keep, [%[in2]] \n" /* preload in2 */ + "ld1 {v7.4s, v8.4s}, [%[in2]], #32 \n" /* load in0 to v7, v8 */ + "fmla v0.4s, v9.4s, v2.s[3] \n" /* v0 += v9 * v2[3] */ + "fmla v1.4s, v10.4s, v2.s[3] \n" /* v1 += v10 * v2[3] */ + "subs %w[cnt], %w[cnt], #1 \n" /* sub cnt */ + "prfm pldl1keep, [%[in3]] \n" /* preload in3 */ + "st1 {v0.4s, v1.4s}, [%[y]], #32 \n" /* store v0, v1 to y */ + "ld1 {v9.4s, v10.4s},[%[in3]], #32 \n" /* load in0 to v9, v10*/ + "bne 1b \n" /* branch to label 1 */ + "sub %[in0], %[in0], #32 \n" /* restore in0 address */ + "sub %[in1], %[in1], #32 \n" /* restore in1 address */ + "sub %[in2], %[in2], #32 \n" /* restore in2 address */ + "sub %[in3], %[in3], #32 \n" /* restore in3 address */ + : [cnt] "+r"(cnt8), + [in0] "+r"(in0_ptr), + [in1] "+r"(in1_ptr), + [in2] "+r"(in2_ptr), + [in3] "+r"(in3_ptr), + [y] "+r"(y_ptr) + : [x] "r"(x_ptr) + : "v0", "v1", "v2", "v3", "v4", "v5", "v6", + "v7", "v8", "v9", "v10", "cc", "memory" + ); + } + if (m_cnt4 > 0) { + int cnt4 = m_cnt4; + asm volatile( + "ld1 {v1.4s}, [%[in0]], #16 \n" /* load in0 to v1 */ + "ld1 {v2.4s}, [%[in1]], #16 \n" /* load in1 to v2 */ + "ld1 {v3.4s}, [%[in2]], #16 \n" /* load in2 to v3 */ + "ld1 {v4.4s}, [%[in3]], 
#16 \n" /* load in3 to v4 */ + "ld1 {v5.4s}, [%[x]] \n" /* load x to v5 */ + "1:\n" + "ld1 {v0.4s}, [%[y]] \n" /* load y to v0 */ + "fmla v0.4s, v1.4s, v5.s[0] \n" /* v0 += v1 * v5[0] */ + "prfm pldl1keep, [%[in0]] \n" /* preload in0 */ + "ld1 {v1.4s}, [%[in0]], #16 \n" /* load in0 to v1 */ + "fmla v0.4s, v2.4s, v5.s[1] \n" /* v0 += v2 * v5[1] */ + "prfm pldl1keep, [%[in1]] \n" /* preload in1 */ + "ld1 {v2.4s}, [%[in1]], #16 \n" /* load in1 to v2 */ + "fmla v0.4s, v3.4s, v5.s[2] \n" /* v0 += v3 * v5[2] */ + "prfm pldl1keep, [%[in2]] \n" /* preload in2 */ + "ld1 {v3.4s}, [%[in2]], #16 \n" /* load in2 to v3 */ + "fmla v0.4s, v4.4s, v5.s[3] \n" /* v0 += v4 * v5[3] */ + "subs %w[cnt], %w[cnt], #1 \n" /* sub cnt */ + "prfm pldl1keep, [%[in3]] \n" /* preload in3 */ + "st1 {v0.4s}, [%[y]], #16 \n" /* store v0 to y */ + "ld1 {v4.4s}, [%[in3]], #16 \n" /* load in3 to v4 */ + "bne 1b \n" /* branch to label 1 */ + "sub %[in0], %[in0], #16 \n" /* restore in0 address*/ + "sub %[in1], %[in1], #16 \n" /* restore in1 address*/ + "sub %[in2], %[in2], #16 \n" /* restore in2 address*/ + "sub %[in3], %[in3], #16 \n" /* restore in3 address*/ + : [cnt] "+r"(cnt4), + [in0] "+r"(in0_ptr), + [in1] "+r"(in1_ptr), + [in2] "+r"(in2_ptr), + [in3] "+r"(in3_ptr), + [y] "+r"(y_ptr) + : [x] "r"(x_ptr) + : "v0", "v1", "v2", "v3", "v4", "v5", "cc", "memory" + ); + } + // clang-format on + for (int r = 0; r < m_remain; ++r) { + float val0 = x_ptr[0] * in0_ptr[r]; + float val1 = x_ptr[1] * in1_ptr[r]; + float val2 = x_ptr[2] * in2_ptr[r]; + float val3 = x_ptr[3] * in3_ptr[r]; + y_ptr[r] += val0 + val1 + val2 + val3; + } + } + } + int cnt4 = M >> 2; + int remain = M & 3; + //! do reduction + int rdc_ths = valid_ths >> 1; + while (rdc_ths > 0) { +#pragma omp parallel for + for (int t = 0; t < rdc_ths; ++t) { + float *y0 = y_buf + t * M; + for (int i = t + rdc_ths; i < valid_ths; i += rdc_ths) { + float *y0_ptr = y0; + float *y_ptr = y_buf + i * M; + for (int j = 0; j < cnt4; ++j) { + float32x4_t val0 = vld1q_f32(y0_ptr + j * 4); + float32x4_t val1 = vld1q_f32(y_ptr + j * 4); + float32x4_t val = vaddq_f32(val0, val1); + vst1q_f32(y0_ptr + j * 4, val); + } + y0_ptr += cnt4 * 4; + y_ptr += cnt4 * 4; + for (int j = 0; j < remain; ++j) { + y0_ptr[j] += y_ptr[j]; + } + } + } + valid_ths = rdc_ths; + rdc_ths = rdc_ths >> 1; + } + if (flag_relu) { + float *in_y = y_buf; + float32x4_t vzero = vdupq_n_f32(0.f); + if (cnt4 > 0) { + int cnt = cnt4; + asm volatile( + "ld1 {v0.4s}, [%[in_y]], #16 \n" /* load y to v0 */ + "1:\n" + "fmax v1.4s, v0.4s, %[vzero].4s \n" /* v0 relu */ + "ld1 {v0.4s}, [%[in_y]], #16 \n" /* load y to v0 */ + "subs %w[cnt], %w[cnt], #1 \n" /* sub cnt */ + "st1 {v1.4s}, [%[out_y]], #16 \n" /* store v1 to y */ + "bne 1b \n" /* branch to label 1*/ + "sub %[in_y], %[in_y], #16 \n" /* restore in_y */ + : [cnt] "+r"(cnt), [in_y] "+r"(in_y), [out_y] "+r"(y) + : [vzero] "w"(vzero) + : "v0", "v1", "cc", "memory"); + } + for (int r = 0; r < remain; ++r) { + y[r] = in_y[r] > 0.f ? 
in_y[r] : 0.f; + } + } else { + memcpy(y, y_buf, M * sizeof(float)); + } +} +#else +void sgemv_trans(const int M, + const int N, + const float *A, + const float *x, + float *y, + bool flag_bias, + const float *bias, + bool flag_relu, + const ARMContext *ctx) { + int m_cnt8 = M >> 3; + int m_cnt4 = (M & 7) >> 2; + int m_remain = M & 7 & 3; + int ths = ctx->threads(); + int valid_ths = std::min((N + 3) / 4, ths); + int valid_block = std::max(4, (N / valid_ths + 3) / 4 * 4); + valid_ths = (N + valid_block - 1) / valid_block; + int block_cnt = valid_block / 4; + float zero_buf[M]; // NOLINT + float y_buf[valid_ths * M]; // NOLINT + memset(zero_buf, 0, M * sizeof(float)); + if (flag_bias) { + memcpy(y_buf, bias, M * sizeof(float)); + memset(y_buf + M, 0, (valid_ths - 1) * M * sizeof(float)); + } else { + memset(y_buf, 0, valid_ths * M * sizeof(float)); + } +#pragma omp parallel for + for (int t = 0; t < valid_ths; ++t) { + float *block_y = y_buf + t * M; + const float *block_x = x + t * valid_block; + const float *block_A = A + t * valid_block * M; + for (int i = 0; i < block_cnt; ++i) { + float *y_ptr = block_y; + const float *x_ptr = block_x + i * 4; + const float *in0_ptr = block_A + i * 4 * M; + const float *in1_ptr = in0_ptr + M; + const float *in2_ptr = in1_ptr + M; + const float *in3_ptr = in2_ptr + M; + int offset = t * valid_block + (i + 1) * 4 - N; + if (offset > 0) { + if (offset > 3) { + in0_ptr = zero_buf; + in1_ptr = zero_buf; + in2_ptr = zero_buf; + in3_ptr = zero_buf; + } else { + switch (offset) { + case 3: + in1_ptr = zero_buf; + case 2: + in2_ptr = zero_buf; + case 1: + in3_ptr = zero_buf; + default: + break; + } + } + } + // clang-format off + if (m_cnt8 > 0) { + int cnt8 = m_cnt8; + asm volatile( + "vld1.32 {d4-d5}, [%[x]] \n" /* load x to q2 */ + "vld1.32 {d6-d9}, [%[in0]]! \n" /* load in0 to q3, q4 */ + "vld1.32 {d10-d13},[%[in1]]! \n" /* load in1 to q5, q6 */ + "vld1.32 {d14-d17},[%[in2]]! \n" /* load in2 to q7, q8 */ + "vld1.32 {d18-d21},[%[in3]]! \n" /* load in3 to q9, q10*/ + "1:\n" + "vld1.32 {d0-d3}, [%[y]] \n" /* load y to q0, q1 */ + "vmla.f32 q0, q3, d4[0] \n" /* q0 += q3 * q2[0] */ + "vmla.f32 q1, q4, d4[0] \n" /* q1 += q4 * q2[0] */ + "pld [%[in0]] \n" /* preload in0 */ + "vld1.32 {d6-d9}, [%[in0]]! \n" /* load in0 to q3, q4 */ + "vmla.f32 q0, q5, d4[1] \n" /* q0 += q5 * q2[1] */ + "vmla.f32 q1, q6, d4[1] \n" /* q1 += q6 * q2[1] */ + "pld [%[in1]] \n" /* preload in1 */ + "vld1.32 {d10-d13},[%[in1]]! \n" /* load in0 to q5, q6 */ + "vmla.f32 q0, q7, d5[0] \n" /* q0 += q7 * q2[2] */ + "vmla.f32 q1, q8, d5[0] \n" /* q1 += q8 * q2[2] */ + "pld [%[in2]] \n" /* preload in2 */ + "vld1.32 {d14-d17},[%[in2]]! \n" /* load in0 to q7, q8 */ + "vmla.f32 q0, q9, d5[1] \n" /* q0 += q9 * q2[3] */ + "vmla.f32 q1, q10, d5[1] \n" /* q1 += q10 * q2[3] */ + "subs %[cnt], %[cnt], #1 \n" /* sub cnt */ + "pld [%[in3]] \n" /* preload in3 */ + "vst1.32 {d0-d3}, [%[y]]! \n" /* store q0, q1 to y */ + "vld1.32 {d18-d21},[%[in3]]! 
\n" /* load in0 to q9, q10*/ + "pld [%[y], #32] \n" /* preload y */ + "bne 1b \n" /* branch to label 1 */ + "sub %[in0], %[in0], #32 \n" /* restore in0 address */ + "sub %[in1], %[in1], #32 \n" /* restore in1 address */ + "sub %[in2], %[in2], #32 \n" /* restore in2 address */ + "sub %[in3], %[in3], #32 \n" /* restore in3 address */ + : [cnt] "+r"(cnt8), + [in0] "+r"(in0_ptr), + [in1] "+r"(in1_ptr), + [in2] "+r"(in2_ptr), + [in3] "+r"(in3_ptr), + [y] "+r"(y_ptr) + : [x] "r"(x_ptr) + : "q0", "q1", "q2", "q3", "q4", "q5", "q6", + "q7", "q8", "q9", "q10", "cc", "memory" + ); + } + if (m_cnt4 > 0) { + int cnt4 = m_cnt4; + asm volatile( + "vld1.32 {d2-d3}, [%[in0]]! \n" /* load in0 to q1 */ + "vld1.32 {d4-d5}, [%[in1]]! \n" /* load in1 to q2 */ + "vld1.32 {d6-d7}, [%[in2]]! \n" /* load in2 to q3 */ + "vld1.32 {d8-d9}, [%[in3]]! \n" /* load in3 to q4 */ + "vld1.32 {d10-d11},[%[x]] \n" /* load x to q5 */ + "1:\n" + "vld1.32 {d0-d1}, [%[y]] \n" /* load y to q0 */ + "vmla.f32 q0, q1, d10[0] \n" /* q0 += q1 * q5[0] */ + "pld [%[in0]] \n" /* preload in0 */ + "vld1.32 {d2-d3}, [%[in0]]! \n" /* load in0 to q1 */ + "vmla.f32 q0, q2, d10[1] \n" /* q0 += q2 * q5[1] */ + "pld [%[in1]] \n" /* preload in1 */ + "vld1.32 {d4-d5}, [%[in1]]! \n" /* load in0 to q2 */ + "vmla.f32 q0, q3, d11[0] \n" /* q0 += q3 * q5[2] */ + "pld [%[in2]] \n" /* preload in2 */ + "vld1.32 {d6-d7}, [%[in2]]! \n" /* load in0 to q3 */ + "vmla.f32 q0, q4, d11[1] \n" /* q0 += q4 * q5[3] */ + "subs %[cnt], %[cnt], #1 \n" /* sub cnt */ + "pld [%[in3]] \n" /* preload in3 */ + "vst1.32 {d0-d1}, [%[y]]! \n" /* store q0 to y */ + "vld1.32 {d8-d9}, [%[in3]]! \n" /* load in0 to q4 */ + "bne 1b \n" /* branch to label 1 */ + "sub %[in0], %[in0], #16 \n" /* restore in0 address*/ + "sub %[in1], %[in1], #16 \n" /* restore in1 address*/ + "sub %[in2], %[in2], #16 \n" /* restore in2 address*/ + "sub %[in3], %[in3], #16 \n" /* restore in3 address*/ + : [cnt] "+r"(cnt4), + [in0] "+r"(in0_ptr), + [in1] "+r"(in1_ptr), + [in2] "+r"(in2_ptr), + [in3] "+r"(in3_ptr), + [y] "+r"(y_ptr) + : [x] "r"(x_ptr) + : "q0", "q1", "q2", "q3", "q4", "q5", "cc", "memory" + ); + } + // clang-format on + for (int r = 0; r < m_remain; ++r) { + float val0 = x_ptr[0] * in0_ptr[r]; + float val1 = x_ptr[1] * in1_ptr[r]; + float val2 = x_ptr[2] * in2_ptr[r]; + float val3 = x_ptr[3] * in3_ptr[r]; + y_ptr[r] += val0 + val1 + val2 + val3; + } + } + } + //! 
do reduction + int rdc_ths = valid_ths >> 1; + while (rdc_ths > 0) { +#pragma omp parallel for + for (int t = 0; t < rdc_ths; ++t) { + float *y0 = y_buf + t * M; + for (int i = t + rdc_ths; i < valid_ths; i += rdc_ths) { + float *y0_ptr = y0; + float *y_ptr = y_buf + i * M; + for (int j = 0; j < m_cnt8; ++j) { + float32x4_t val00 = vld1q_f32(y0_ptr + j * 8); + float32x4_t val01 = vld1q_f32(y0_ptr + j * 8 + 4); + float32x4_t val10 = vld1q_f32(y_ptr + j * 8); + float32x4_t val11 = vld1q_f32(y_ptr + j * 8 + 4); + float32x4_t val0 = vaddq_f32(val00, val10); + float32x4_t val1 = vaddq_f32(val01, val11); + vst1q_f32(y0_ptr + j * 8, val0); + vst1q_f32(y0_ptr + j * 8 + 4, val1); + } + y0_ptr += m_cnt8 * 8; + y_ptr += m_cnt8 * 8; + for (int j = 0; j < m_cnt4; ++j) { + float32x4_t val0 = vld1q_f32(y0_ptr + j * 4); + float32x4_t val1 = vld1q_f32(y_ptr + j * 4); + float32x4_t val = vaddq_f32(val0, val1); + vst1q_f32(y0_ptr + j * 4, val); + } + y0_ptr += m_cnt4 * 4; + y_ptr += m_cnt4 * 4; + for (int j = 0; j < m_remain; ++j) { + y0_ptr[j] += y_ptr[j]; + } + } + } + valid_ths = rdc_ths; + rdc_ths = rdc_ths >> 1; + } + if (flag_relu) { + float *in_y = y_buf; + float32x4_t vzero = vdupq_n_f32(0.f); + if (m_cnt8 > 0) { + int cnt8 = m_cnt8; + asm volatile( + "vld1.32 {d0-d3}, [%[in_y]]! \n" /* load y to q0, q1 */ + "1:\n" + "vmax.f32 q2, q0, %q[vzero] \n" /* q0 relu */ + "vld1.32 {d0-d1}, [%[in_y]]! \n" /* load y to q0 */ + "vmax.f32 q3, q1, %q[vzero] \n" /* q1 relu */ + "subs %[cnt], %[cnt], #1 \n" /* sub cnt */ + "vst1.32 {d4-d7}, [%[out_y]]! \n" /* store q0, q1 to y*/ + "vld1.32 {d2-d3}, [%[in_y]]! \n" /* load y to q0 */ + "bne 1b \n" /* branch to label 1*/ + "sub %[in_y], %[in_y], #32 \n" /* restore in_y */ + : [cnt] "+r"(cnt8), [in_y] "+r"(in_y), [out_y] "+r"(y) + : [vzero] "w"(vzero) + : "q0", "q1", "q2", "q3", "cc", "memory"); + } + if (m_cnt4 > 0) { + int cnt4 = m_cnt4; + asm volatile( + "vld1.32 {d0-d1}, [%[in_y]]! \n" /* load y to q0 */ + "1:\n" + "vmax.f32 q1, q0, %q[vzero] \n" /* q0 relu */ + "vld1.32 {d0-d1}, [%[in_y]]! \n" /* load y to q0 */ + "subs %[cnt], %[cnt], #1 \n" /* sub cnt */ + "vst1.32 {d2-d3}, [%[out_y]]! \n" /* store q1 to y */ + "bne 1b \n" /* branch to label 1*/ + "sub %[in_y], %[in_y], #16 \n" /* restore in_y */ + : [cnt] "+r"(cnt4), [in_y] "+r"(in_y), [out_y] "+r"(y) + : [vzero] "w"(vzero) + : "q0", "q1", "cc", "memory"); + } + for (int r = 0; r < m_remain; ++r) { + y[r] = in_y[r] > 0.f ? in_y[r] : 0.f; + } + } else { + memcpy(y, y_buf, M * sizeof(float)); + } +} +#endif // __aarch64__ bool sgemv(const float *A, const float *x, @@ -59,33 +549,34 @@ bool sgemv(const float *A, int N, bool is_bias, const float *bias, - bool is_relu) { + bool is_relu, + const ARMContext *ctx) { if (transA) { - LOG(ERROR) << " sgemv, transA is not supported now"; - return false; - } - if (is_bias) { - //! with bias - if (is_relu) { - //! with relu - sgemv_bias_relu(transA, M, N, A, x, y, bias); - } else { - //! without relu - sgemv_bias(transA, M, N, A, x, y, bias); - } + sgemv_trans(M, N, A, x, y, is_bias, bias, is_relu, ctx); } else { - //! without bias - if (is_relu) { - //! with relu - sgemv_relu(transA, M, N, A, x, y); + if (is_bias) { + //! with bias + if (is_relu) { + //! with relu + sgemv_bias_relu(transA, M, N, A, x, y, bias); + } else { + //! without relu + sgemv_bias(transA, M, N, A, x, y, bias); + } } else { - //! without relu - sgemv(transA, M, N, A, x, y); + //! without bias + if (is_relu) { + //! with relu + sgemv_relu(transA, M, N, A, x, y); + } else { + //! 
without relu + sgemv(transA, M, N, A, x, y); + } } } return true; } - +// clang-format off //! define compute kernel #ifdef __aarch64__ #define SGEMV_IN_8 \ @@ -179,8 +670,8 @@ bool sgemv(const float *A, "fmla v5.4s, v9.4s, v21.4s \n" /* mul + add*/ \ "fmla v6.4s, v9.4s, v23.4s \n" /* mul + add*/ \ "fmla v7.4s, v9.4s, v25.4s \n" /* mul + add*/ \ - "bne 1b \n" /* jump to main loop */ /* pair add to final \ - result */ \ + "bne 1b \n" /* jump to main loop */ \ + /* pair add to final result */ \ "2: \n" /* reduce to scale */ \ "faddp v16.4s, v0.4s, v0.4s\n" /* pair add to vector */ \ "faddp s8, v16.2s \n" /* pair add to scale */ \ @@ -231,8 +722,8 @@ bool sgemv(const float *A, "fmla v0.4s, v8.4s, v10.4s \n" /* mul + add*/ \ "subs %w[cnt], %w[cnt], #1 \n" /* sub main loop count */ \ "fmla v1.4s, v9.4s, v11.4s \n" /* mul + add*/ \ - "bne 1b \n" /* jump to main loop */ /* pair add to final \ - result */ \ + "bne 1b \n" /* jump to main loop */ \ + /* pair add to final result */ \ "2: \n" /* reduce to scale */ \ "fadd v9.4s, v0.4s, v1.4s \n" /* add 2 vector */ \ "faddp v10.4s, v9.4s, v9.4s\n" /* pair add to vector */ \ @@ -283,7 +774,7 @@ bool sgemv(const float *A, "fmax s8, s8, s0 \n" /* relu */ \ "str s8, [%[out]] \n" /* save result */ -#else //__aarch64__ +#else // __aarch64__ #define SGEMV_IN_4 \ "pld [%[in]] @ preload cache line, input\n" \ @@ -349,8 +840,8 @@ bool sgemv(const float *A, "vmla.f32 q1, q5, q9 @ mul add\n" \ "vmla.f32 q2, q5, q11 @ mul add\n" \ "vmla.f32 q3, q5, q13 @ mul add\n" \ - "bne 1b @ jump to main loop\n" /* pair add to final \ - result */ \ + "bne 1b @ jump to main loop\n" \ + /* pair add to final result */ \ "2: @ pair add \n" \ "vpadd.f32 d8, d0, d1 @ pair add, first step\n" \ "vpadd.f32 d9, d2, d3 @ pair add, first step\n" \ @@ -382,13 +873,10 @@ bool sgemv(const float *A, "vmla.f32 q0, q12, q14 @ mul add\n" \ "vmla.f32 q0, q13, q15 @ mul add\n" \ "subs %[cnt] , #1 @ sub loop count \n" \ - "bne 1b @ jump to main loop\n" /* pair add to \ - final result \ - */ \ + "bne 1b @ jump to main loop\n" \ "2: @ end processing\n" \ "vpadd.f32 d2, d0, d1 @ pair add, first step\n" \ - "vpadd.f32 d0, d2, d2 @ pair add, final step\n" /* check tails \ - */ \ + "vpadd.f32 d0, d2, d2 @ pair add, final step\n"/*check tails*/ \ "cmp %[tail], #1 @ check whether has mid cols\n" \ "blt 4f @ jump to end\n" \ "3: @ tail loop\n" \ @@ -422,7 +910,7 @@ bool sgemv(const float *A, "vmax.f32 d0, d0, d1 @ relu\n" \ "vst1.32 {d0[0]}, [%[out]] @ save result\n" #endif - +// clang-format on void sgemv(const bool transA, const int M, const int N, @@ -523,7 +1011,7 @@ void sgemv(const bool transA, [tmp4] "r"(tmp4) : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); } -#else //__aarch64__ +#else // __aarch64__ int out_cnt = M >> 2; #pragma omp parallel for for (int j = 0; j < out_cnt; j++) { @@ -579,7 +1067,7 @@ void sgemv(const bool transA, : [out] "r"(ptr_out) : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); } -#endif //__aarch64__ +#endif // __aarch64__ } void sgemv_relu(const bool transA, @@ -671,7 +1159,7 @@ void sgemv_relu(const bool transA, : [out] "r"(ptr_out) : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); } -#else //__aarch64__ +#else // __aarch64__ int out_cnt = M >> 2; #pragma omp parallel for for (int j = 0; j < out_cnt; j++) { @@ -727,7 +1215,7 @@ void sgemv_relu(const bool transA, : [out] "r"(ptr_out) : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); } -#endif //__aarch64__ +#endif // __aarch64__ } void sgemv_bias(const bool 
transA, @@ -822,7 +1310,7 @@ void sgemv_bias(const bool transA, : [out] "r"(ptr_out), [bias0] "r"(bias0) : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); } -#else //__aarch64__ +#else // __aarch64__ int out_cnt = M >> 2; #pragma omp parallel for for (int j = 0; j < out_cnt; j++) { @@ -887,7 +1375,7 @@ void sgemv_bias(const bool transA, : [out] "r"(ptr_out), [bias0] "r"(bias0) : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); } -#endif //__aarch64__ +#endif // __aarch64__ } void sgemv_bias_relu(const bool transA, @@ -980,7 +1468,7 @@ void sgemv_bias_relu(const bool transA, : [out] "r"(ptr_out), [bias0] "r"(bias0) : "v0", "v1", "v8", "v9", "v10", "v11", "v16", "v17", "cc", "memory"); } -#else //__aarch64__ +#else // __aarch64__ int out_cnt = M >> 2; #pragma omp parallel for for (int j = 0; j < out_cnt; j++) { @@ -1045,7 +1533,7 @@ void sgemv_bias_relu(const bool transA, : [out] "r"(ptr_out), [bias0] "r"(bias0) : "q0", "q1", "q12", "q13", "q14", "q15", "cc", "memory"); } -#endif //__aarch64__ +#endif // __aarch64__ } } // namespace math diff --git a/lite/backends/arm/math/sgemv.h b/lite/backends/arm/math/sgemv.h index 4d74006f93..aa17349c99 100644 --- a/lite/backends/arm/math/sgemv.h +++ b/lite/backends/arm/math/sgemv.h @@ -15,6 +15,8 @@ #pragma once #include +#include "lite/core/context.h" +#include "lite/core/device_info.h" namespace paddle { namespace lite { @@ -28,9 +30,10 @@ bool sgemv(const float* A, bool transA, int M, int N, - bool is_bias = false, - const float* bias = nullptr, - bool is_relu = false); + bool is_bias, + const float* bias, + bool is_relu, + const ARMContext* ctx); } // namespace math } // namespace arm diff --git a/lite/backends/cuda/CMakeLists.txt b/lite/backends/cuda/CMakeLists.txt index a6c3fcc66a..f73b4120e6 100644 --- a/lite/backends/cuda/CMakeLists.txt +++ b/lite/backends/cuda/CMakeLists.txt @@ -1,8 +1,7 @@ if(NOT LITE_WITH_CUDA) return() endif() -set(cuda_static_deps cudnn_static cublas_static curand_static - culibos_static cudart_static) +get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES) nv_library(target_wrapper_cuda SRCS target_wrapper.cc DEPS ${cuda_static_deps}) nv_library(cuda_blas SRCS blas.cc DEPS ${cuda_static_deps}) diff --git a/lite/backends/cuda/cuda_utils.h b/lite/backends/cuda/cuda_utils.h index 13bf8190ef..9da70262f5 100644 --- a/lite/backends/cuda/cuda_utils.h +++ b/lite/backends/cuda/cuda_utils.h @@ -56,6 +56,15 @@ CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << CudnnGetErrorInfo(status); \ } +const int CUDA_NUM_THREADS = 512; +// CUDA: number of blocks for threads. 
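+// CUDA_GET_BLOCKS below is a ceiling division, i.e. it returns
+// ceil(N / CUDA_NUM_THREADS) (or ceil(N / base) for the two-argument form),
+// so a 1-D launch covers all N elements with one thread each. A typical,
+// purely illustrative launch would look like:
+//   some_kernel<<<CUDA_GET_BLOCKS(n), CUDA_NUM_THREADS, 0, stream>>>(n, ...);
+// where some_kernel, n and stream are placeholders, not symbols added here.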
+inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} +inline int CUDA_GET_BLOCKS(const int N, const int base) { + return (N + base - 1) / base; +} + namespace paddle { namespace lite { namespace cuda { diff --git a/lite/backends/cuda/math/CMakeLists.txt b/lite/backends/cuda/math/CMakeLists.txt index a5ee25643b..fafd74ae7a 100644 --- a/lite/backends/cuda/math/CMakeLists.txt +++ b/lite/backends/cuda/math/CMakeLists.txt @@ -2,8 +2,7 @@ if(NOT LITE_WITH_CUDA) return() endif() -set(cuda_static_deps cudnn_static cublas_static curand_static - culibos_static cudart_static) +get_property(cuda_static_deps GLOBAL PROPERTY CUDA_STATIC_MODULES) nv_library(cuda_activation SRCS activation.cu DEPS ${cuda_static_deps}) nv_library(cuda_scale SRCS scale.cu DEPS ${cuda_static_deps}) @@ -12,6 +11,9 @@ nv_library(cuda_transpose SRCS transpose.cu DEPS ${cuda_static_deps}) nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale cuda_type_trans ${cuda_static_deps}) nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps}) +nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps}) +nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps}) +nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps}) set ( math_cuda @@ -21,6 +23,9 @@ set ( cuda_type_trans cuda_transpose cuda_elementwise + cudnn_pool + cuda_gemm + cuda_batched_gemm ) set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda") diff --git a/lite/backends/cuda/math/batched_gemm.cc b/lite/backends/cuda/math/batched_gemm.cc new file mode 100644 index 0000000000..e815109276 --- /dev/null +++ b/lite/backends/cuda/math/batched_gemm.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/math/batched_gemm.h" +#include +#include "lite/core/device_info.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template <> +bool BatchedGemm::init(const bool trans_a, + const bool trans_b, + const int max_batch_size, + Context *ctx) { + if (cu_handle_ == nullptr) { + this->exe_stream_ = ctx->exec_stream(); + CUBLAS_CALL(cublasCreate(&cu_handle_)); + CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_)); + } + cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N; + cudaMalloc(reinterpret_cast(&A_), + 3 * max_batch_size * sizeof(float *)); + return true; +} + +template <> +bool BatchedGemm::run(const float alpha, + const float beta, + const float *a[], + const float *b[], + float *c[], + const int m, + const int n, + const int k, + const int batch_size) { + CHECK(a != nullptr); + CHECK(b != nullptr); + CHECK(c != nullptr); + lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m; + ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? 
n : k; + ldc_ = n; + m_ = m; + n_ = n; + k_ = k; + cudaMemcpyAsync(A_, + a, + batch_size * sizeof(const float *), + cudaMemcpyHostToDevice, + exe_stream_); + cudaMemcpyAsync(A_ + batch_size, + b, + batch_size * sizeof(const float *), + cudaMemcpyHostToDevice, + exe_stream_); + cudaMemcpyAsync(A_ + batch_size * 2, + c, + batch_size * sizeof(float *), + cudaMemcpyHostToDevice, + exe_stream_); + CUBLAS_CALL(cublasSgemmBatched(cu_handle_, + cu_trans_b_, + cu_trans_a_, + n_, + m_, + k_, + &alpha, + const_cast(A_ + batch_size), + ldb_, + const_cast(A_), + lda_, + &beta, + A_ + batch_size * 2, + ldc_, + batch_size)); + return true; +} + +template <> +bool BatchedGemm::run(const float alpha, + const float beta, + const float *a[], + const int m, + const int n, + const int k, + const int batch_size) { + CHECK(a != nullptr); + lda_ = (cu_trans_a_ == CUBLAS_OP_N) ? k : m; + ldb_ = (cu_trans_b_ == CUBLAS_OP_N) ? n : k; + ldc_ = n; + m_ = m; + n_ = n; + k_ = k; + cudaMemcpyAsync(A_, + a, + 3 * batch_size * sizeof(const float *), + cudaMemcpyDefault, + exe_stream_); + CUBLAS_CALL(cublasSgemmBatched(cu_handle_, + cu_trans_b_, + cu_trans_a_, + n_, + m_, + k_, + &alpha, + const_cast(A_ + batch_size), + ldb_, + const_cast(A_), + lda_, + &beta, + A_ + batch_size * 2, + ldc_, + batch_size)); + return true; +} + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/batched_gemm.h b/lite/backends/cuda/math/batched_gemm.h new file mode 100644 index 0000000000..2b91d3a524 --- /dev/null +++ b/lite/backends/cuda/math/batched_gemm.h @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
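+// Usage sketch (names are illustrative, not defined by this header): the
+// wrapper packs the host-side A/B/C pointer arrays into one device buffer
+// with cudaMemcpyAsync and then issues a single cublasSgemmBatched call,
+// swapping the operand order (B before A, with dims n, m, k) so that
+// row-major buffers map onto column-major cuBLAS.
+//   lite::cuda::math::BatchedGemm<float, float> bgemm;
+//   bgemm.init(false, false, max_batch, &ctx);   // ctx: CUDA context
+//   bgemm.run(1.f, 0.f, a_ptrs, b_ptrs, c_ptrs, m, n, k, batch);
+// a_ptrs/b_ptrs/c_ptrs are host arrays of device pointers, each of size
+// at least batch.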
+ +#pragma once +#include +#include +#include +#include "lite/api/paddle_place.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/context.h" +#include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +class BatchedGemm { + public: + BatchedGemm() : cu_handle_(nullptr) {} + ~BatchedGemm() { + if (A_ != nullptr) { + cudaFree(A_); + } + } + + bool init(const bool trans_a, + const bool trans_b, + const int max_batch_size, + Context* ctx); + + bool run(const PtypeOut alpha, + const PtypeOut beta, + const PtypeIn* a[], + const PtypeIn* b[], + PtypeOut* c[], + const int m, + const int n, + const int k, + const int batch_size); + + bool run(const PtypeOut alpha, + const PtypeOut beta, + const PtypeIn* a[], + const int m, + const int n, + const int k, + const int batch_size); + + private: + cudaStream_t exe_stream_; + cublasHandle_t cu_handle_; + cublasOperation_t cu_trans_a_; + cublasOperation_t cu_trans_b_; + int m_{-1}; + int n_{-1}; + int k_{-1}; + int lda_{-1}; + int ldb_{-1}; + int ldc_{-1}; + PtypeIn** A_{nullptr}; +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/cudnn_conv.cc b/lite/backends/cuda/math/cudnn_conv.cc index 72ed3951f6..a4f33f467f 100644 --- a/lite/backends/cuda/math/cudnn_conv.cc +++ b/lite/backends/cuda/math/cudnn_conv.cc @@ -31,6 +31,9 @@ bool CudnnConv2D::create(const operators::ConvParam& param, auto o_dims = param.output->dims(); int batch = x_dims[0]; + auto paddings = *param.paddings; + auto dilations = *param.dilations; + int iw = x_dims[3]; // nchw int ih = x_dims[2]; int ic = x_dims[1]; @@ -41,10 +44,10 @@ bool CudnnConv2D::create(const operators::ConvParam& param, int kh = w_dims[2]; int sw = param.strides[1]; int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; + int pw = paddings[2]; + int ph = paddings[0]; + int dw = dilations[1]; + int dh = dilations[0]; CHECK(ic % param.groups == 0) << "The conv input channel shoud be divide group number."; @@ -133,8 +136,8 @@ bool CudnnConv2D::create(const operators::ConvParam& param, this->fwd_algo_ = algo_cache.GetAlgorithm(x_dims.Vectorize(), w_dims.Vectorize(), param.strides, - param.paddings, - param.dilations, + *param.paddings, + *param.dilations, 0, search_func); @@ -311,12 +314,15 @@ bool CudnnConv2DInt8::create(const operators::ConvParam& param, int kw = w_dims[2]; int kh = w_dims[1]; + auto paddings = *param.paddings; + auto dilations = *param.dilations; + int sw = param.strides[1]; int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; + int pw = paddings[2]; + int ph = paddings[0]; + int dw = dilations[1]; + int dh = dilations[0]; std::vector weight_scale = param.weight_scale; float input_scale = param.input_scale; diff --git a/lite/backends/cuda/math/cudnn_pool.cc b/lite/backends/cuda/math/cudnn_pool.cc new file mode 100644 index 0000000000..f970fc326b --- /dev/null +++ b/lite/backends/cuda/math/cudnn_pool.cc @@ -0,0 +1,159 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/math/cudnn_pool.h" +#include "lite/backends/cuda/math/activation.h" +#include "lite/backends/cuda/math/scale.h" +#include "lite/backends/cuda/math/type_trans.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +inline void UpdatePadding(std::vector* paddings, + const bool global_pooling, + const bool adaptive, + const std::vector& data_dims, + const std::vector& strides, + const std::vector& ksize) { + if (paddings->size() == data_dims.size()) { + for (size_t i = 0; i < data_dims.size(); ++i) { + int copy_pad = *(paddings->begin() + 2 * i); + paddings->insert(paddings->begin() + 2 * i + 1, copy_pad); + } + } else { + CHECK(data_dims.size() * 2 == paddings->size()) + << "Paddings size should be the same or twice as the pooling size."; + } + if (global_pooling || adaptive) { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } +} + +inline void UpdateKsize(std::vector* ksize, + const std::vector& data_dims) { + ksize->resize(static_cast(data_dims.size())); + for (size_t i = 0; i < ksize->size(); ++i) { + *(ksize->begin() + i) = static_cast(data_dims[i]); + } +} + +template <> +bool CudnnPool2DNHWC::create( + const operators::PoolParam& param, Context* ctx) { + return true; +} + +template <> +bool CudnnPool2DNHWC::init(const operators::PoolParam& param, + Context* ctx) { + this->stream_ = ctx->exec_stream(); + CUDNN_CHECK(cudnnCreate(&this->handle_)); + CUDNN_CHECK(cudnnSetStream(this->handle_, this->stream_)); + + cudnnCreateTensorDescriptor(&this->input_desc_); + cudnnCreateTensorDescriptor(&this->output_desc_); + cudnnCreatePoolingDescriptor(&this->pooling_desc_); + + return create(param, ctx); +} + +template <> +bool CudnnPool2DNHWC::run( + const operators::PoolParam& param) { + auto x_dims = param.x->dims(); + auto o_dims = param.output->dims(); + int batch = x_dims[0]; + const float* in_data = param.x->data(); + float* out_data = param.output->mutable_data(TARGET(kCUDA)); + + int ih = x_dims[1]; + int iw = x_dims[2]; // nchw + int ic = x_dims[3]; + + int oh = o_dims[1]; + int ow = o_dims[2]; + int oc = o_dims[3]; + + std::vector ksize = param.ksize; + std::vector strides = param.strides; + std::vector paddings = *(param.paddings.get()); + + std::string pooling_type = param.pooling_type; + bool global_pooling = param.global_pooling; + bool exclusive = param.exclusive; + bool adaptive = param.adaptive; + + std::vector data_dims = {ih, iw}; + UpdatePadding(&paddings, global_pooling, adaptive, data_dims, strides, ksize); + + if (data_dims.size() * 2 == paddings.size()) { + for (size_t i = 0; i < data_dims.size(); ++i) { + paddings.erase(paddings.begin() + i + 1); + } + } + + if (global_pooling) { + UpdateKsize(&ksize, data_dims); + } + CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->input_desc_, + CUDNN_TENSOR_NHWC, + CUDNN_DATA_FLOAT, + batch, + ic, + ih, + iw)); + + CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->output_desc_, + CUDNN_TENSOR_NHWC, + CUDNN_DATA_FLOAT, + batch, + oc, + oh, + ow)); + cudnnPoolingMode_t mode; + if (pooling_type == "max") { + mode = CUDNN_POOLING_MAX; + } 
else { + mode = exclusive ? CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING + : CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + } + CUDNN_CHECK(cudnnSetPoolingNdDescriptor(this->pooling_desc_, + mode, + CUDNN_NOT_PROPAGATE_NAN, + ksize.size(), + ksize.data(), + paddings.data(), + strides.data())); + float alpha = 1.0f; + float beta = 0.0f; + CUDNN_CHECK(cudnnPoolingForward(this->handle_, + this->pooling_desc_, + &alpha, + this->input_desc_, + in_data, + &beta, + this->output_desc_, + out_data)); + + return true; +} + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/cudnn_pool.h b/lite/backends/cuda/math/cudnn_pool.h new file mode 100644 index 0000000000..acdc695b50 --- /dev/null +++ b/lite/backends/cuda/math/cudnn_pool.h @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "lite/api/paddle_place.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/context.h" +#include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +class CudnnPool2DBase { + public: + CudnnPool2DBase() + : handle_(NULL), + input_desc_(NULL), + output_desc_(NULL), + pooling_desc_(NULL) {} + + ~CudnnPool2DBase() { + if (handle_ != NULL) { + CUDNN_CHECK(cudnnDestroy(handle_)); + } + if (input_desc_) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(input_desc_)); + } + if (output_desc_) { + CUDNN_CHECK(cudnnDestroyTensorDescriptor(output_desc_)); + } + if (pooling_desc_) { + cudnnDestroyPoolingDescriptor(pooling_desc_); + } + } + + protected: + cudaStream_t stream_; + cudnnHandle_t handle_; + cudnnTensorDescriptor_t input_desc_; + cudnnTensorDescriptor_t output_desc_; + cudnnPoolingDescriptor_t pooling_desc_; +}; + +template +class CudnnPool2DNHWC : public CudnnPool2DBase { + public: + CudnnPool2DNHWC() : CudnnPool2DBase() {} + virtual ~CudnnPool2DNHWC() = default; + virtual bool init(const operators::PoolParam& param, + Context* ctx); + + virtual bool create(const operators::PoolParam& param, + Context* ctx); + + virtual bool run(const operators::PoolParam& param); +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/elementwise.cu b/lite/backends/cuda/math/elementwise.cu index 57c9ec022a..8f0ebd1f97 100644 --- a/lite/backends/cuda/math/elementwise.cu +++ b/lite/backends/cuda/math/elementwise.cu @@ -13,13 +13,55 @@ // limitations under the License. 
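+// The kernels in this hunk generalize elementwise add into a single
+// templated kernel driven by a BinaryOperation selector (kADD/kMUL/kDIV,
+// declared in utils.h), with y broadcast over the middle extent: for an
+// output laid out as pre x n x post, element tid reads y[tid / post % n].
+// A hedged, illustrative call to the host launcher added here:
+//   lite::cuda::math::elementwise(x_data, y_data, out_data,
+//                                 pre, n, post,
+//                                 lite::cuda::math::BinaryOperation::kADD,
+//                                 stream);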
#include "lite/backends/cuda/math/elementwise.h" -#include "lite/backends/cuda/math/utils.h" namespace paddle { namespace lite { namespace cuda { namespace math { +template +__global__ void elementwise_kernel(const size_t total, + const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < total) { + int idx = tid / post % n; +#if __CUDA_ARCH__ >= 350 + out_data[tid] = binary_calc(__ldg(x_data + tid), __ldg(y_data + idx), type); +#else + out_data[tid] = binary_calc(x_data[tid], y_data[idx], type); +#endif + } +} + +template +__global__ void elementwise_relu_kernel(const size_t total, + const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < total) { + int idx = tid / post % n; + Dtype temp; +#if __CUDA_ARCH__ >= 350 + temp = binary_calc(__ldg(x_data + tid), __ldg(y_data + idx), type); + +#else + temp = binary_calc(x_data[tid], y_data[idx], type); +#endif + out_data[tid] = temp > 0 ? temp : 0; + } +} + template __global__ void elementwise_add_kernel(const size_t total, const Dtype* x_data, @@ -76,6 +118,56 @@ __global__ void elementwise_add_nhwc4_int8_kernel(const size_t total, } } +template +void elementwise(const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type, + cudaStream_t stream) { + int num = pre * n * post; + int thread = 256; + int block = (num + thread - 1) / thread; + elementwise_kernel<<>>( + num, x_data, y_data, out_data, pre, n, post, type); +} + +template +void elementwise_relu(const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type, + cudaStream_t stream) { + int num = pre * n * post; + int thread = 256; + int block = (num + thread - 1) / thread; + elementwise_relu_kernel<<>>( + num, x_data, y_data, out_data, pre, n, post, type); +} + +template void elementwise(const float*, + const float*, + float*, + int, + int, + int, + BinaryOperation, + cudaStream_t); + +template void elementwise_relu(const float*, + const float*, + float*, + int, + int, + int, + BinaryOperation, + cudaStream_t); + template void elementwise_add(int num, const Dtype* x_data, diff --git a/lite/backends/cuda/math/elementwise.h b/lite/backends/cuda/math/elementwise.h index 7fcdf95021..ce45d0544e 100644 --- a/lite/backends/cuda/math/elementwise.h +++ b/lite/backends/cuda/math/elementwise.h @@ -15,12 +15,33 @@ #pragma once #include #include +#include "lite/backends/cuda/math/utils.h" namespace paddle { namespace lite { namespace cuda { namespace math { +template +void elementwise(const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type, + cudaStream_t stream); + +template +void elementwise_relu(const Dtype* x_data, + const Dtype* y_data, + Dtype* out_data, + int pre, + int n, + int post, + BinaryOperation type, + cudaStream_t stream); + template void elementwise_add(int num, const Dtype* x_data, diff --git a/lite/backends/cuda/math/gemm.cc b/lite/backends/cuda/math/gemm.cc new file mode 100644 index 0000000000..a9f12984aa --- /dev/null +++ b/lite/backends/cuda/math/gemm.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/math/gemm.h" +#include +#include "lite/core/device_info.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template <> +bool Gemm::init(const bool trans_a, + bool trans_b, + const int m, + const int n, + const int k, + Context *ctx) { + if (cu_handle_ == nullptr) { + this->exe_stream_ = ctx->exec_stream(); + CUBLAS_CALL(cublasCreate(&cu_handle_)); + CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_)); + } + lda_ = (!trans_a) ? k : m; + ldb_ = (!trans_b) ? n : k; + ldc_ = n; + m_ = m; + n_ = n; + k_ = k; + cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N; + return true; +} + +template <> +bool Gemm::init(const bool trans_a, + bool trans_b, + const int m, + const int n, + const int k, + const int lda, + const int ldb, + const int ldc, + Context *ctx) { + if (cu_handle_ == nullptr) { + this->exe_stream_ = ctx->exec_stream(); + CUBLAS_CALL(cublasCreate(&cu_handle_)); + CUBLAS_CALL(cublasSetStream(cu_handle_, this->exe_stream_)); + } + m_ = m; + n_ = n; + k_ = k; + lda_ = lda; + ldb_ = ldb; + ldc_ = ldc; + cu_trans_a_ = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; + cu_trans_b_ = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N; + return true; +} + +template <> +bool Gemm::run(const float alpha, + const float beta, + const float *a, + const float *b, + float *c, + Context *ctx) { + CUBLAS_CALL(cublasSgemm(cu_handle_, + cu_trans_b_, + cu_trans_a_, + n_, + m_, + k_, + &alpha, + b, + ldb_, + a, + lda_, + &beta, + c, + ldc_)); + return true; +} + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/gemm.h b/lite/backends/cuda/math/gemm.h new file mode 100644 index 0000000000..12194d54b0 --- /dev/null +++ b/lite/backends/cuda/math/gemm.h @@ -0,0 +1,74 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
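+// Usage sketch (illustrative names only): init() records the shapes and
+// leading dimensions, and run() performs C = A * B on row-major buffers by
+// calling cublasSgemm with the operands swapped and the (n, m, k) order,
+// i.e. it evaluates the column-major product B^T * A^T, whose memory layout
+// is exactly the row-major A * B.
+//   lite::cuda::math::Gemm<float, float> gemm;
+//   gemm.init(false, false, m, n, k, &ctx);   // ctx: CUDA context
+//   gemm.run(1.f, 0.f, d_a, d_b, d_c, &ctx);  // d_*: device pointers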
+ +#pragma once +#include +#include +#include +#include "lite/api/paddle_place.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/context.h" +#include "lite/core/target_wrapper.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace cuda { +namespace math { + +template +class Gemm { + public: + Gemm() : cu_handle_(nullptr) {} + ~Gemm() {} + bool init(const bool trans_a, + const bool trans_b, + const int m, + const int n, + const int k, + Context* ctx); + bool init(const bool trans_a, + const bool trans_b, + const int m, + const int n, + const int k, + const int lda, + const int ldb, + const int ldc, + Context* ctx); + + bool run(const PtypeOut alpha, + const PtypeOut beta, + const PtypeIn* a, + const PtypeIn* b, + PtypeOut* c, + Context* ctx); + + private: + cudaStream_t exe_stream_; + cublasHandle_t cu_handle_; + cublasOperation_t cu_trans_a_; + cublasOperation_t cu_trans_b_; + int m_{-1}; + int n_{-1}; + int k_{-1}; + int lda_{-1}; + int ldb_{-1}; + int ldc_{-1}; +}; + +} // namespace math +} // namespace cuda +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/utils.h b/lite/backends/cuda/math/utils.h index b4cd82fd8d..b6aa9c7d16 100644 --- a/lite/backends/cuda/math/utils.h +++ b/lite/backends/cuda/math/utils.h @@ -25,6 +25,24 @@ namespace lite { namespace cuda { namespace math { +enum class BinaryOperation { + kADD = 0, + kMUL = 1, + kDIV = 2, +}; + +template +__device__ T binary_calc(T x, T y, BinaryOperation type); + +template <> +__device__ __forceinline__ float binary_calc(float x, + float y, + BinaryOperation type) { + if (type == BinaryOperation::kADD) return x + y; + if (type == BinaryOperation::kMUL) return x * y; + if (type == BinaryOperation::kDIV) return x / y; +} + template __device__ T from_float(float x); diff --git a/lite/backends/fpga/KD/pes/conv_process.hpp b/lite/backends/fpga/KD/pes/conv_process.hpp index fd17218d06..23332b422d 100644 --- a/lite/backends/fpga/KD/pes/conv_process.hpp +++ b/lite/backends/fpga/KD/pes/conv_process.hpp @@ -294,10 +294,17 @@ inline void split_filter_num(const ConvParam& c_param) { args.image.channels = input->shape().channel(); args.image.width = input->shape().width(); args.image.height = input->shape().height(); - args.image.pad_width = param.paddings[1]; + auto paddings = *param.padding; + args.image.pad_width = param.paddings[2]; args.image.pad_height = param.paddings[0]; args.output.address = out_address; args.output.scale_address = out_scale_address; + bool pad_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + if (!pad_equal) { + LOG(FATA) << "This pad not support ! " << paddings[0] << ", " + << paddings[1] << ", " << paddings[2] << ", " << paddings[3]; + } param.splitParams().push_back(conv_param); } } @@ -372,10 +379,18 @@ inline void split_channel(const ConvParam& c_param) { args.image.channels = conv_param->input.shape().channel(); args.image.width = conv_param->input.shape().width(); args.image.height = conv_param->input.shape().height(); - args.image.pad_width = param.paddings[1]; - args.image.pad_height = param.paddings[0]; + auto paddings = *param.paddings; + args.image.pad_width = paddings[2]; + args.image.pad_height = paddings[0]; + args.output.address = conv_param->output.mutableData(); args.output.scale_address = conv_param->output.scale(); + bool pad_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + if (!pad_equal) { + LOG(FATA) << "This pad not support ! 
" << paddings[0] << ", " + << paddings[1] << ", " << paddings[2] << ", " << paddings[3]; + } param.splitParams().push_back(conv_param); } } diff --git a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp old mode 100755 new mode 100644 index 9d7b9b544b..f86806102d --- a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp +++ b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp @@ -61,14 +61,21 @@ class DepthwiseConvPE : public PE { args.image.channels = input->shape().channel(); args.image.height = input->shape().height(); args.image.width = input->shape().width(); - args.image.pad_width = param.paddings[0]; - args.image.pad_height = param.paddings[1]; + auto paddings = *param.paddings; + args.image.pad_width = param.paddings[2]; + args.image.pad_height = param.paddings[0]; args.image.scale_address = input->scale(); args.output.address = output->data(); args.output.scale_address = output->scale(); args.out_width = param.output->shape().width(); args.out_height = param.output->shape().height(); args.sub_conv_num = 1; + bool pad_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + if (!pad_equal) { + LOG(FATA) << "This pad not support ! " << paddings[0] << ", " + << paddings[1] << ", " << paddings[2] << ", " << paddings[3]; + } param.args = args; inplace_.relu_enable = param_.relu.enabled; diff --git a/lite/backends/fpga/KD/pes/pooling_pe.hpp b/lite/backends/fpga/KD/pes/pooling_pe.hpp index fd3be1f463..5bb4f5285a 100644 --- a/lite/backends/fpga/KD/pes/pooling_pe.hpp +++ b/lite/backends/fpga/KD/pes/pooling_pe.hpp @@ -45,13 +45,14 @@ class PoolingPE : public PE { PoolingArgs args = {0}; args.mode = param_.type; + auto paddings = *param_.paddings; args.kernel_reciprocal = fp32_2_fp16(1.0f / (k_width * k_height)); args.image.address = input->data(); args.image.channels = input->shape().channel(); args.image.height = input->shape().height(); args.image.width = input->shape().width(); - args.image.pad_height = param_.paddings[0]; - args.image.pad_width = param_.paddings[1]; + args.image.pad_height = paddings[0]; + args.image.pad_width = paddings[2]; args.image.scale_address = input->scale(); args.output.address = output->mutableData(); args.output.scale_address = output->scale(); @@ -76,12 +77,13 @@ class PoolingPE : public PE { float* image_addr = float_input.mutableData(FP32, input->shape()); float_input.copyFrom(input); float16* data_out = output->data(); + auto paddings = *param_.paddings; int image_height = input->shape().height(); int image_width = input->shape().width(); int image_channels = input->shape().channel(); - int image_pad_h = param_.paddings[0]; - int image_pad_w = param_.paddings[1]; + int image_pad_h = paddings[0]; + int image_pad_w = paddings[2]; int kernel_height = param_.kernelSize[1]; int kernel_width = param_.kernelSize[0]; int kernel_step_h = param_.strides[0]; diff --git a/lite/backends/npu/builder.cc b/lite/backends/npu/builder.cc index ad5bed5be9..954fad8c91 100644 --- a/lite/backends/npu/builder.cc +++ b/lite/backends/npu/builder.cc @@ -142,21 +142,25 @@ ge::TensorPtr CvtTensor(lite::Tensor* in_tensor, int CvtActMode(std::string act_type) { int act_mode = 1; - if (act_type == "sigmod") { + if (act_type == "sigmoid") { act_mode = 0; } else if (act_type == "relu") { act_mode = 1; } else if (act_type == "tanh") { act_mode = 2; + } else if (act_type == "relu_clipped") { + act_mode = 3; } else if (act_type == "elu") { act_mode = 4; + } else if (act_type == "leaky_relu") { + act_mode = 5; } else if (act_type == 
"abs") { act_mode = 6; } else if (act_type == "softsign") { act_mode = 8; } else if (act_type == "softplus") { act_mode = 9; - } else if (act_type == "hardsigmoid") { + } else if (act_type == "hard_sigmoid") { act_mode = 10; } else { // TODO(hong19860320) support more activation mode diff --git a/lite/backends/npu/builder.h b/lite/backends/npu/builder.h index 02f7071a4e..70200354fb 100644 --- a/lite/backends/npu/builder.h +++ b/lite/backends/npu/builder.h @@ -31,117 +31,6 @@ // Extended Ops of HIAI DDK namespace ge { -/** - * Multiply the matrix x1 by the matrix x2 to generate x1 * x2. - * The inputs must be two-dimensional matrices and the inner dimension of "x1" - * (after being transposed if transpose_x1 is true) must match the outer - * dimension of "x2" (after being transposed if transposed_x2 is true). - * x : the first input tensor, must be non const op. - * w : the second input tensor, must be const op. - * bias: the optional bias tensor, must be const op. - * - * y : the output tensor. - * - * has_bias: If true, enable input bias. - */ -REG_OP(MatMul) - .INPUT(x, TensorType({DT_FLOAT})) - .INPUT(w, TensorType({DT_FLOAT})) - .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT})) // bias must be const input - .OUTPUT(y, TensorType({DT_FLOAT})) - .ATTR(has_bias, AttrValue::BOOL{false}) // when has input::bias,set true - .OP_END(); - -/** - * Computes the gradients of convolution with respect to the input. - * - * input_sizes : An integer vector representing the shape of input, - * where input is a 4-D [batch, height, width, channels] tensor. - * filter : the filter tensor, with shape [H , W, filter_channel, - * filter_number], filter_channel must be same as x channel. - * x : The input tensor. - * - * y : The output tensor. - * - * format: 0: NCHW. 1: NHWC - * group : 1: default - * num_output : 0: default, num_output must be equal to - * (filter_channel * group) - * pad : Padding for the beginning and ending along each axis - * stride : Stride along each axis. - * dilation : dilation value along each axis of the filter. - * pad_mode : 0:NOTSET, 5:VALID 6:SAME. defaul value is 0:NOTSET - * bias_term : 0: default - * kernel : The shape of the convolution kernel - */ -REG_OP(Deconvolution) - .INPUT(input_sizes, TensorType({DT_UINT8})) - .INPUT(filter, TensorType({DT_FLOAT})) - .INPUT(x, TensorType({DT_FLOAT})) - .OPTIONAL_INPUT(b, TensorType({DT_FLOAT})) - .OUTPUT(y, TensorType({DT_FLOAT})) - .ATTR(mode, AttrValue::INT{1}) - .ATTR(format, AttrValue::INT{1}) - .ATTR(group, AttrValue::INT{1}) - .ATTR(num_output, AttrValue::INT{0}) - .ATTR(pad, AttrValue::LIST_INT({0, 0, 0, 0})) - .ATTR(stride, AttrValue::LIST_INT({1, 1})) - .ATTR(dilation, AttrValue::LIST_INT({1, 1})) - .ATTR(pad_mode, AttrValue::INT{0}) - .ATTR(bias_term, AttrValue::INT{0}) - .ATTR(kernel, AttrValue::LIST_INT({0, 0})) - .OP_END(); - -/** - * Resize images to size using bilinear interpolation. - * - * x : The tensor of 4-D - * w : A int32 Tensor of 2 elements: [height, width]. - * - * y : the output tensor - * - * align_corners : If true, the centers of the 4 corner pixels of the - * input and output tensors are aligned, preserving the values at the corner - * pixels. - * output_dim_mode : Defaults 2, including 0: zoom_factor , 1: - * shrink_factor, 2: height/width. when output_dim_mode=2, the output-dim is - * controled by the [height, width] of w. - * shrink_factor : shrink factor. - * zoom_factor : zoom factor. - * pad_begin : begin of pad. - * pad_end : end of pad. 
- */ -REG_OP(ResizeBilinear) - .INPUT(x, TensorType({DT_FLOAT, DT_INT32})) - .INPUT(w, TensorType({DT_FLOAT, DT_INT32})) - .OUTPUT(y, TensorType({DT_FLOAT, DT_INT32})) - .ATTR(align_corners, AttrValue::BOOL{false}) - .ATTR(output_dim_mode, AttrValue::INT{2}) - .ATTR(shrink_factor, AttrValue::INT{1}) - .ATTR(zoom_factor, AttrValue::INT{1}) - .ATTR(pad_begin, AttrValue::INT{0}) - .ATTR(pad_end, AttrValue::INT{0}) - .OP_END(); - -/** - * Resize images to size using nearest neighbor interpolation. - * - * image : Resize images to size using nearest neighbor interpolation. - * size : Must be one dimension and two elements - * - * output : the output tensor - * - * align_corners : If true, the centers of the 4 corner pixels of the - * input and output tensors are aligned, preserving the values at the corner - * pixels. Defaults to false - */ -REG_OP(ResizeNearestNeighbor) - .INPUT(image, TensorType({DT_FLOAT, DT_INT32, DT_UINT8, DT_BOOL})) - .INPUT(size, TensorType({DT_INT32})) - .OUTPUT(output, TensorType({DT_FLOAT, DT_INT32, DT_UINT8, DT_BOOL})) - .ATTR(align_corners, AttrValue::BOOL{false}) - .OP_END(); - /** * Pads a tensor. * diff --git a/lite/backends/opencl/cl_wrapper.cc b/lite/backends/opencl/cl_wrapper.cc index 357ac8c2d6..93e176f9ed 100644 --- a/lite/backends/opencl/cl_wrapper.cc +++ b/lite/backends/opencl/cl_wrapper.cc @@ -75,7 +75,7 @@ void CLWrapper::InitFunctions() { do { \ cl_func##_ = (cl_func##Type)dlsym(handle_, #cl_func); \ if (cl_func##_ == nullptr) { \ - LOG(ERROR) << "Cannot find the " << #cl_func \ + LOG(FATAL) << "Cannot find the " << #cl_func \ << " symbol in libOpenCL.so!"; \ break; \ } \ diff --git a/lite/backends/x86/math/CMakeLists.txt b/lite/backends/x86/math/CMakeLists.txt index 2dea4364d5..a891076323 100644 --- a/lite/backends/x86/math/CMakeLists.txt +++ b/lite/backends/x86/math/CMakeLists.txt @@ -50,7 +50,8 @@ math_library(unpooling) math_library(vol2col) ## math_library(prelu) math_library(tree2col DEPS math_function) - +math_library(sequence_topk_avg_pooling) +math_library(search_fc DEPS blas dynload_mklml) # cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) # cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) # cc_test(im2col_test SRCS im2col_test.cc DEPS im2col) diff --git a/lite/backends/x86/math/beam_search.cc b/lite/backends/x86/math/beam_search.cc index bbe35b4de5..8d61fb3bbb 100644 --- a/lite/backends/x86/math/beam_search.cc +++ b/lite/backends/x86/math/beam_search.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "lite/backends/x86/math/beam_search.h" #include +#include #include #include "lite/fluid/lod.h" diff --git a/lite/backends/x86/math/pooling.cc b/lite/backends/x86/math/pooling.cc index 9da239f9c6..ab6c1edb48 100644 --- a/lite/backends/x86/math/pooling.cc +++ b/lite/backends/x86/math/pooling.cc @@ -49,7 +49,7 @@ class Pool2dFunctor { const int stride_height = strides[0]; const int stride_width = strides[1]; const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int padding_width = paddings[2]; const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; @@ -130,7 +130,7 @@ class Pool2dGradFunctor { const int stride_height = strides[0]; const int stride_width = strides[1]; const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int padding_width = paddings[2]; const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; @@ -213,7 +213,7 @@ class MaxPool2dGradFunctor { const int stride_height = strides[0]; const int stride_width = strides[1]; const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int padding_width = paddings[2]; const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; @@ -629,7 +629,7 @@ class MaxPool2dWithIndexFunctor { const int stride_height = strides[0]; const int stride_width = strides[1]; const int padding_height = paddings[0]; - const int padding_width = paddings[1]; + const int padding_width = paddings[2]; const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; diff --git a/lite/backends/x86/math/search_fc.cc b/lite/backends/x86/math/search_fc.cc new file mode 100644 index 0000000000..56fc363cb4 --- /dev/null +++ b/lite/backends/x86/math/search_fc.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/x86/math/search_fc.h" +#include +#include + +namespace paddle { +namespace lite { +namespace x86 { +namespace math { + +/* + * All tensors' dimension should be the same and the values of + * each dimension must be the same, except the axis dimension. 
+ */ +template +class SearchFcFunctor { + public: + void operator()(const lite::X86Context& context, + const lite::Tensor& bottom, + const lite::Tensor& w, + const lite::Tensor& b, + lite::Tensor* top, + int out_size) { + int batch = bottom.dims()[0]; + + int _out = w.dims()[0]; // 100 + int _in = w.dims()[1]; // 228 + + lite::DDim dims(std::vector({bottom.dims()[0], out_size})); + + const auto bottom_data = bottom.data(); + auto top_data = top->mutable_data(lite::TargetType::kX86); + const auto weights = w.data(); + auto blas = math::GetBlas(context); + call_gemm(blas, + CblasNoTrans, + CblasTrans, + batch, + _out, + _in, + 1.0f, + bottom_data, + weights, + 0.0f, + top_data); + if (true) { + const auto* bias_data = b.data(); + for (int i = 0; i < batch; ++i) { + // add bias here + sse_eltadd(top_data + i * _out, bias_data, top_data + i * _out, _out); + } + } + } + + // private: +}; + +#define DEFINE_FUNCTOR(type) \ + template class SearchFcFunctor; + +FOR_ALL_TYPES(DEFINE_FUNCTOR); + +} // namespace math +} // namespace x86 +} // namespace lite +} // namespace paddle diff --git a/lite/backends/x86/math/search_fc.h b/lite/backends/x86/math/search_fc.h new file mode 100644 index 0000000000..e415c39602 --- /dev/null +++ b/lite/backends/x86/math/search_fc.h @@ -0,0 +1,184 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "lite/backends/x86/math/blas.h" +#include "lite/backends/x86/mklml.h" +#include "lite/core/context.h" +#include "lite/core/tensor.h" +#include "lite/fluid/data_type.h" + +namespace paddle { +namespace lite { +namespace x86 { +namespace math { + +template +void call_gemm(const BlasT blas, + const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const T* B, + const T beta, + T* C) { +#ifndef __NAIVE_GEMM__ + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +#else + naive::gemm((TransA == CblasTrans), + (TransB == CblasTrans), + M, + N, + K, + alpha, + A, + B, + beta, + C); +#endif // !__NAIVE_GEMM__ +} + +// To align with Lego +#ifndef LEGO_USE_FLOAT +#define LEGO_USE_FLOAT +#endif +#ifndef LEGO_SSE +#define LEGO_SSE +#endif + +#if defined(LEGO_USE_FLOAT) + +#define __m256x __m256 +#define __m128x __m128 + +static const unsigned int AVX_STEP_SIZE = 8; +static const unsigned int SSE_STEP_SIZE = 4; +static const unsigned int AVX_CUT_LEN_MASK = 7U; +static const unsigned int SSE_CUT_LEN_MASK = 3U; + +#define _mm256_setzero_px _mm256_setzero_ps +#define _mm256_mul_px _mm256_mul_ps +#define _mm256_add_px _mm256_add_ps +#define _mm256_load_px _mm256_loadu_ps +#define _mm256_hadd_px _mm256_hadd_ps +#define _mm256_permute2f128_px _mm256_permute2f128_ps +#define _mm256_store_px _mm256_storeu_ps +#define _mm256_broadcast_sx _mm256_broadcast_ss +#define _mm256_castpx256_px128 _mm256_castps256_ps128 +#define _mm256_max_px _mm256_max_ps +#define _mm256_sub_px _mm256_sub_ps +#define _mm256_set1_px _mm256_set1_ps +#define _mm256_sqrt_px _mm256_sqrt_ps +#define _mm256_div_px _mm256_div_ps +#define _mm_setzero_px _mm_setzero_ps +#define _mm_add_px _mm_add_ps +#define _mm_mul_px _mm_mul_ps +#define _mm_load_px _mm_loadu_ps +#define _mm_hadd_px _mm_hadd_ps +#define _mm_store_sx _mm_store_ss +#define _mm_store_px _mm_storeu_ps +#define _mm_load1_px _mm_load1_ps +#define _mm_max_px _mm_max_ps +#define _mm_sub_px _mm_sub_ps +#define _mm_set1_px _mm_set1_ps +#define _mm_sqrt_px _mm_sqrt_ps +#define _mm_div_px _mm_div_ps + +#elif defined(LEGO_USE_DOUBLE) + +#define __m256x __m256d +#define __m128x __m128d + +static const unsigned int AVX_STEP_SIZE = 4; +static const unsigned int SSE_STEP_SIZE = 2; +static const unsigned int AVX_CUT_LEN_MASK = 3U; +static const unsigned int SSE_CUT_LEN_MASK = 1U; + +#define _mm256_setzero_px _mm256_setzero_pd +#define _mm256_mul_px _mm256_mul_pd +#define _mm256_add_px _mm256_add_pd +#define _mm256_load_px _mm256_loadu_pd +#define _mm256_hadd_px _mm256_hadd_pd +#define _mm256_permute2f128_px _mm256_permute2f128_pd +#define _mm256_store_px _mm256_storeu_pd +#define _mm256_broadcast_sx _mm256_broadcast_sd +#define _mm256_castpx256_px128 _mm256_castpd256_pd128 +#define _mm256_max_px _mm256_max_pd +#define _mm256_sub_px _mm256_sub_pd +#define _mm256_set1_px _mm256_set1_pd +#define _mm256_sqrt_px _mm256_sqrt_pd +#define _mm256_div_px _mm256_div_pd +#define _mm_setzero_px _mm_setzero_pd +#define _mm_add_px _mm_add_pd +#define _mm_mul_px _mm_mul_pd +#define _mm_load_px _mm_loadu_pd +#define _mm_hadd_px _mm_hadd_pd +#define _mm_store_sx _mm_store_sd +#define _mm_store_px _mm_storeu_pd +#define _mm_load1_px _mm_load1_pd +#define _mm_max_px _mm_max_pd +#define _mm_sub_px _mm_sub_pd +#define _mm_set1_px _mm_set1_pd +#define _mm_sqrt_px _mm_sqrt_pd +#define _mm_div_px _mm_div_pd +#endif + +template +inline void sse_eltadd(const T* x, const T* y, T* z, size_t len) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(LEGO_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px( + z + jjj, + _mm256_add_px(_mm256_load_px(x + jjj), _mm256_load_px(y + jjj))); + } +#elif defined(LEGO_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(z + jjj, + _mm_add_px(_mm_load_px(x + jjj), _mm_load_px(y + jjj))); + } +#endif + for (; jjj < len; jjj++) { + z[jjj] = x[jjj] + 
y[jjj]; + } +} + +template +class SearchFcFunctor { + public: + void operator()(const lite::Context& context, + const lite::Tensor& X, + const lite::Tensor& W, + const lite::Tensor& b, + lite::Tensor* Out, + int out_size); +}; + +} // namespace math +} // namespace x86 +} // namespace lite +} // namespace paddle + +#define FOR_ALL_TYPES(macro) macro(float); diff --git a/lite/backends/x86/math/sequence_topk_avg_pooling.cc b/lite/backends/x86/math/sequence_topk_avg_pooling.cc new file mode 100644 index 0000000000..035a7923c7 --- /dev/null +++ b/lite/backends/x86/math/sequence_topk_avg_pooling.cc @@ -0,0 +1,151 @@ +/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "lite/backends/x86/math/sequence_topk_avg_pooling.h" +#include +#include + +namespace paddle { +namespace lite { +namespace x86 { +namespace math { + +template +void get_topk_pos(const T* data, int length, int k, int* pos, bool debug) { + size_t real_k = k < length ? k : length; + + std::vector v(data, data + length); + + std::vector topk_pos; + T min_val = -10000000.0; + while (topk_pos.size() < real_k) { + T max_val = min_val; + int max_pos = -1; + for (int i = 0; i < length; ++i) { + if (v[i] > max_val) { + max_pos = i; + max_val = v[i]; + } + } + + assert(max_pos >= 0); + + topk_pos.push_back(max_pos); + v[max_pos] = min_val; + } + + assert(topk_pos.size() > 0); + while (topk_pos.size() < (size_t)k) { + topk_pos.push_back(-1); + } + + for (size_t i = 0; i < topk_pos.size(); ++i) { + pos[i] = topk_pos[i]; + } +} + +/* + * All tensors' dimension should be the same and the values of + * each dimension must be the same, except the axis dimension. 
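// ---------------------------------------------------------------------------
// A scalar reference sketch of what SearchFcFunctor above computes: W is
// stored as [out_size x in_size], so the GEMM call (CblasNoTrans, CblasTrans)
// yields top = bottom * W^T, and sse_eltadd then adds the bias to each row.
#include <vector>

inline void search_fc_ref(const std::vector<float>& bottom,  // [batch x in]
                          const std::vector<float>& w,       // [out x in]
                          const std::vector<float>& b,       // [out]
                          std::vector<float>* top,           // [batch x out]
                          int batch, int in, int out) {
  top->assign(static_cast<size_t>(batch) * out, 0.f);
  for (int i = 0; i < batch; ++i) {
    for (int o = 0; o < out; ++o) {
      float acc = b[o];
      for (int k = 0; k < in; ++k) {
        acc += bottom[i * in + k] * w[o * in + k];
      }
      (*top)[i * out + o] = acc;
    }
  }
}
// ---------------------------------------------------------------------------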
+ */ +template +class SequenceTopkAvgPoolingFunctor { + public: + void operator()(const lite::Tensor& in, + const lite::Tensor& row, + const lite::Tensor& col, + lite::Tensor* out, + lite::Tensor* pos, + int channel_num, + std::vector topks) { + auto k_num = topks.size(); + auto max_k = topks[topks.size() - 1]; + std::vector vec_pos_shape; + auto in_lod = in.lod()[0]; + auto row_lod = row.lod()[0]; + auto col_lod = col.lod()[0]; + int batch_size = row_lod.size() - 1; + int pos_total_size = row_lod[batch_size] * channel_num * max_k; + vec_pos_shape.push_back(pos_total_size); + lite::DDim dims(vec_pos_shape); + pos->Resize(dims); + auto pos_data = pos->mutable_data(lite::TargetType::kX86); + + int offset = 0; + std::vector vec_out_lod; + vec_out_lod.reserve(batch_size + 1); + for (int i = 0; i <= batch_size; ++i) { + offset = row_lod[i]; + vec_out_lod.push_back(offset); + } + + lite::LoD lod_temp; + lod_temp.push_back(vec_out_lod); + out->set_lod(lod_temp); + + auto in_data = in.data(); + auto out_data = out->mutable_data(lite::TargetType::kX86); + + T* sum_data = new T[max_k]; + for (int i = 0; i < batch_size; ++i) { + int total_size = in_lod[i + 1] - in_lod[i]; + int row_size = row_lod[i + 1] - row_lod[i]; + int col_size = col_lod[i + 1] - col_lod[i]; + + CHECK_EQ(total_size, channel_num * row_size * col_size) + << "size wrong in sequence_topk_avg_pooling_op!"; + + int feature_num = row_size * col_size; + for (int j = 0; j < channel_num; ++j) { + auto input_offset_feature_data = in_data + in_lod[i] + j * feature_num; + + for (int r = 0; r < row_size; ++r) { + auto row_data = input_offset_feature_data + r * col_size; + auto pos_slice_data = pos_data + row_lod[i] * channel_num * max_k + + r * channel_num * max_k + j * max_k; + auto out_slice_data = out_data + row_lod[i] * channel_num * k_num + + r * channel_num * k_num + j * k_num; + + get_topk_pos(row_data, col_size, max_k, pos_slice_data); + if (pos_slice_data[0] == -1) { + sum_data[0] = 0.0; + } else { + sum_data[0] = row_data[pos_slice_data[0]]; + } + for (int k = 1; k < max_k; ++k) { + if (pos_slice_data[k] == -1) { + sum_data[k] = sum_data[k - 1]; + } else { + sum_data[k] = sum_data[k - 1] + row_data[pos_slice_data[k]]; + } + } + for (size_t k = 0; k < k_num; ++k) { + out_slice_data[k] = sum_data[topks[k] - 1] / topks[k]; + } + } + } + } + delete[] sum_data; + } +}; + +#define DEFINE_FUNCTOR(type) \ + template class SequenceTopkAvgPoolingFunctor; + +FOR_ALL_TYPES(DEFINE_FUNCTOR); + +} // namespace math +} // namespace x86 +} // namespace lite +} // namespace paddle diff --git a/lite/backends/x86/math/sequence_topk_avg_pooling.h b/lite/backends/x86/math/sequence_topk_avg_pooling.h new file mode 100644 index 0000000000..78d458c4d8 --- /dev/null +++ b/lite/backends/x86/math/sequence_topk_avg_pooling.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
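// ---------------------------------------------------------------------------
// A worked example of the per-row computation in SequenceTopkAvgPoolingFunctor
// above: for one row of col_size scores, the functor emits, for every k in
// `topks`, the average of the k largest scores; positions padded with -1 by
// get_topk_pos simply repeat the previous prefix sum. With
// scores = {0.2, 0.9, 0.5, 0.1} and topks = {1, 3} (so max_k = 3):
//   get_topk_pos -> positions {1, 2, 0}   (values 0.9, 0.5, 0.2)
//   prefix sums  -> {0.9, 1.4, 1.6}
//   outputs      -> top-1 avg = 0.9 / 1 = 0.9, top-3 avg = 1.6 / 3 = 0.533...
// ---------------------------------------------------------------------------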
*/ + +#pragma once +#include +#include "lite/core/context.h" +#include "lite/core/tensor.h" +#include "lite/fluid/data_type.h" + +namespace paddle { +namespace lite { +namespace x86 { +namespace math { +template +void get_topk_pos( + const T* data, int length, int k, int* pos, bool debug = false); + +template +class SequenceTopkAvgPoolingFunctor { + public: + void operator()(const lite::Tensor& X, + const lite::Tensor& ROW, + const lite::Tensor& COLUMN, + lite::Tensor* Out, + lite::Tensor* pos, + int channel_num, + std::vector topks); +}; + +} // namespace math +} // namespace x86 +} // namespace lite +} // namespace paddle + +#define FOR_ALL_TYPES(macro) macro(float); diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index b02ef8fed6..641302cd2d 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -100,7 +100,7 @@ add_custom_target(all_kernel_faked_cc DEPENDS all_kernel_faked.cc) #----------------------------------------------- NOT CHANGE ----------------------------------------------- lite_cc_library(kernel SRCS kernel.cc DEPS context type_system target_wrapper any op_params tensor - PROFILE_DEPS basic_profiler + PROFILE_DEPS lite_profiler ) lite_cc_library(op SRCS op_lite.cc DEPS scope op_registry target_wrapper kernel cpp_op_desc tensor @@ -114,7 +114,7 @@ lite_cc_library(type_system SRCS type_system.cc DEPS tensor target_wrapper) lite_cc_library(program SRCS program.cc DEPS op kernel model_parser ${ops} ${cpp_wrapper} - PROFILE_DEPS basic_profiler) + PROFILE_DEPS lite_profiler) if (NOT LITE_ON_TINY_PUBLISH) lite_cc_library(optimizer SRCS optimizer.cc DEPS mir_pass_manager model_parser program) diff --git a/lite/core/arena/framework.cc b/lite/core/arena/framework.cc index c59c078787..561a508d20 100644 --- a/lite/core/arena/framework.cc +++ b/lite/core/arena/framework.cc @@ -37,6 +37,9 @@ void TestCase::CreateInstruction() { // prepare context (*it)->SetContext(std::move(ctx_)); instruction_.reset(new Instruction(op, std::move(*it))); +#ifdef LITE_WITH_PROFILE + instruction_->set_profiler(new profile::Profiler()); +#endif } void TestCase::PrepareInputsForInstruction() { diff --git a/lite/core/context.h b/lite/core/context.h index 19238f1a9b..5063600d36 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -253,6 +253,13 @@ class Context { std::string name() const { return "CUDAContext"; } + CUDAContext& operator=(const CUDAContext& context) { + this->Init( + context.device_id_, context.exec_stream_id_, context.io_stream_id_); + cublas_fp32_ = const_cast(context).cublas_fp32(); + return *this; + } + private: int device_id_; // overall information @@ -345,7 +352,6 @@ class ContextScheduler { std::unique_ptr NewContext(TargetType target) { std::unique_ptr ctx(new KernelContext); - switch (target) { case TARGET(kHost): kernel_contexts_[TargetType::kHost].As().CopySharedTo( @@ -416,6 +422,7 @@ class ContextScheduler { void InitContext() { kernel_contexts_[Type].As().InitOnce(); } + ContextScheduler() { InitContext(); #ifdef LITE_WITH_X86 diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc index 166c04c000..f5b757ac3c 100644 --- a/lite/core/device_info.cc +++ b/lite/core/device_info.cc @@ -1039,7 +1039,7 @@ int DeviceInfo::Setup() { << ", max freq: " << max_freqs_[i] << ", min freq: " << min_freqs_[i] << ", cluster ID: " << cluster_ids_[core_ids_[i]] - << ", CPU ARCH: A" << archs_[i]; + << ", CPU ARCH: A" << static_cast(archs_[i]); } LOG(INFO) << "L1 DataCache size is: "; for (int i = 0; i < core_num_; ++i) { @@ -1093,7 +1093,7 @@ void 
DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) { RequestPowerRandLowMode(shift_num, thread_num); break; default: - LOG(FATAL) << "Unsupported power mode: " << mode; + LOG(FATAL) << "Unsupported power mode: " << static_cast(mode); break; } if (active_ids_.empty()) { diff --git a/lite/core/kernel.h b/lite/core/kernel.h index 05d7a6b333..86193235a2 100644 --- a/lite/core/kernel.h +++ b/lite/core/kernel.h @@ -31,7 +31,7 @@ #include "lite/utils/replace_stl/stream.h" #ifdef LITE_WITH_PROFILE -#include "lite/core/profile/basic_profiler.h" +#include "lite/core/profile/profiler.h" #endif // LITE_WITH_PROFILE namespace paddle { @@ -58,7 +58,10 @@ class KernelBase { virtual void Run() = 0; #ifdef LITE_WITH_PROFILE - void SetProfileID(uint32_t id) { profile_id_ = id; } + void SetProfiler(profile::Profiler* profiler, int id) { + profiler_ = profiler; + profile_id_ = id; + } #endif void Launch() { @@ -82,10 +85,12 @@ class KernelBase { #endif #ifdef LITE_WITH_PROFILE - if (profile_id_ >= 0) { - profile::ProfileBlock x(profile_id_, "kernel"); - Run(); - } + CHECK(profiler_) << "Profiler pointer of kernel can not be nullptr. " + "When LITE_WITH_PROFILE is defined, please set a " + "Profiler for Instruction."; + profiler_->StartTiming(profile_id_, ctx_.get()); + Run(); + profiler_->StopTiming(profile_id_, ctx_.get()); #else Run(); #endif @@ -175,6 +180,7 @@ class KernelBase { bool is_first_epoch_{true}; #ifdef LITE_WITH_PROFILE + profile::Profiler* profiler_{nullptr}; int profile_id_{-1}; #endif }; diff --git a/lite/core/memory.cc b/lite/core/memory.cc index ec94f69be1..eefada3f99 100644 --- a/lite/core/memory.cc +++ b/lite/core/memory.cc @@ -110,7 +110,7 @@ void TargetCopy(TargetType target, void* dst, const void* src, size_t size) { TargetWrapper::MemcpySync( dst, src, size, IoDirection::DtoD); break; -#endif +#endif #ifdef LITE_WITH_OPENCL case TargetType::kOpenCL: TargetWrapperCL::MemcpySync(dst, src, size, IoDirection::DtoD); diff --git a/lite/core/mir/fusion/conv_activation_fuse_pass.cc b/lite/core/mir/fusion/conv_activation_fuse_pass.cc index ff064fb2ee..0d11b47db6 100644 --- a/lite/core/mir/fusion/conv_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_activation_fuse_pass.cc @@ -47,4 +47,5 @@ void ConvActivationFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_conv_activation_fuse_pass, paddle::lite::mir::ConvActivationFusePass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}) .BindKernel("conv2d"); diff --git a/lite/core/mir/fusion/conv_bn_fuse_pass.cc b/lite/core/mir/fusion/conv_bn_fuse_pass.cc index d9d9c1bbf5..5ab5f8c0a4 100644 --- a/lite/core/mir/fusion/conv_bn_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_bn_fuse_pass.cc @@ -45,4 +45,4 @@ void ConvBNFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_conv_bn_fuse_pass, paddle::lite::mir::ConvBNFusePass) .BindTargets({TARGET(kAny)}) - .ExcludeTargets({TARGET(kX86)}); + .ExcludeTargets({TARGET(kX86), TARGET(kXPU)}); diff --git a/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc b/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc index fd9aadc5d0..b1b492ce03 100644 --- a/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_elementwise_fuse_pass.cc @@ -46,4 +46,5 @@ void ConvElementwiseFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_conv_elementwise_fuse_pass, paddle::lite::mir::ConvElementwiseFusePass) - .BindTargets({TARGET(kAny)}); + .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}); diff --git 
a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc index af66f5ab66..e4391cd242 100644 --- a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc @@ -35,4 +35,5 @@ void ElementwiseAddActivationFusePass::Apply( REGISTER_MIR_PASS(lite_elementwise_add_activation_fuse_pass, paddle::lite::mir::ElementwiseAddActivationFusePass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}) .BindKernel("fusion_elementwise_add_activation"); diff --git a/lite/core/mir/fusion/fc_fuse_pass.cc b/lite/core/mir/fusion/fc_fuse_pass.cc index ed10f06f56..7fc4492192 100644 --- a/lite/core/mir/fusion/fc_fuse_pass.cc +++ b/lite/core/mir/fusion/fc_fuse_pass.cc @@ -33,4 +33,5 @@ void FcFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass) .BindTargets({TARGET(kAny)}) + .ExcludeTargets({TARGET(kXPU)}) .BindKernel("fc"); diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc index f823f45dc6..da611e4490 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc @@ -396,6 +396,8 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, op_desc->SetAttr("input_scale", scale_value); op_desc->SetInput("X", {input_act_node->arg()->name}); IR_NODE_LINK_TO(input_act_node, quantized_node) + auto update_op_desc = *quantized_node->stmt()->mutable_op_info(); + quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places()); // delete nodes and edges std::unordered_set nodes2rm = {input_scale_node, @@ -440,6 +442,8 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, op_desc->SetInput("Y", {input_act_right_node->arg()->name}); IR_NODE_LINK_TO(input_act_left_node, quantized_node) IR_NODE_LINK_TO(input_act_right_node, quantized_node) + auto update_op_desc = *quantized_node->stmt()->mutable_op_info(); + quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places()); // delete nodes and edges std::unordered_set nodes2rm = {input_scale_left_node, diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc index 1f2355e8a3..4f41ba4a60 100644 --- a/lite/core/mir/memory_optimize_pass.cc +++ b/lite/core/mir/memory_optimize_pass.cc @@ -255,4 +255,5 @@ void MemoryOptimizePass::Apply(const std::unique_ptr& graph) { } // namespace paddle REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) - .BindTargets({TARGET(kARM)}); + .BindTargets({TARGET(kARM)}) + .ExcludeTargets({TARGET(kOpenCL), TARGET(kNPU), TARGET(kXPU)}); diff --git a/lite/core/mir/pass.h b/lite/core/mir/pass.h index 4de0fdbf35..4e8c8be292 100644 --- a/lite/core/mir/pass.h +++ b/lite/core/mir/pass.h @@ -52,34 +52,44 @@ class Pass { // Bind targets. At runtime, there must be one device in the bound targets. void BindTargets(const std::set& targets) { - std::set res; for (const auto& target : targets) { const std::set& universe = ExpandValidTargets(target); std::set_union(bound_targets_.begin(), bound_targets_.end(), universe.begin(), universe.end(), - std::inserter(res, res.begin())); + std::inserter(bound_targets_, bound_targets_.begin())); } - bound_targets_ = res; } // Exclude targets. At runtime, there must be one device in the bound targets. + // Disable the pass if one of the valid devices is in the excluded targets. 
void ExcludeTargets(const std::set& targets) { - std::set res; for (const auto& target : targets) { const std::set& universe = ExpandValidTargets(target); - std::set_difference(bound_targets_.begin(), - bound_targets_.end(), - universe.begin(), - universe.end(), - std::inserter(res, res.begin())); + std::set updated_bound_targets; + std::set_difference( + bound_targets_.begin(), + bound_targets_.end(), + universe.begin(), + universe.end(), + std::inserter(updated_bound_targets, updated_bound_targets.begin())); + bound_targets_ = updated_bound_targets; + std::set_union( + excluded_targets_.begin(), + excluded_targets_.end(), + universe.begin(), + universe.end(), + std::inserter(excluded_targets_, excluded_targets_.begin())); } - bound_targets_ = res; } // Get all bound targets. - const std::set& Targets() const { return bound_targets_; } + const std::set& BoundTargets() const { return bound_targets_; } + // Get all excluded targets. + const std::set& ExcludedTargets() const { + return excluded_targets_; + } // Some passes are only available on qualified kernels and need to be // explicitly declared. @@ -116,6 +126,7 @@ class Pass { std::string name_; std::string doc_; std::set bound_targets_; + std::set excluded_targets_; std::unordered_map> bound_kernels_; }; diff --git a/lite/core/mir/pass_utils.cc b/lite/core/mir/pass_utils.cc index 4f6be2c186..5bddfcbd3c 100644 --- a/lite/core/mir/pass_utils.cc +++ b/lite/core/mir/pass_utils.cc @@ -47,10 +47,34 @@ bool KernelRegistered(const std::string name, const Place& place) { return false; } -bool PassMatchesTarget(const mir::Pass& pass, TargetType target) { - const auto& targets = pass.Targets(); - if (targets.find(TARGET(kAny)) != targets.end()) return true; - return (targets.find(target) != targets.end()); +bool PassMatchesTarget(const mir::Pass& pass, + const std::set& targets) { + // Whether the pass is suitable for targets ? The condition is the + // intersection of targets and pass's bound targets is not empty, besides the + // intersection of targets and pass's excluded targets is empty. The formula + // is as follows: matched = !empty(targets ^ pass.bound_targets) && + // empty(targets ^ pass.excluded_targets), where ^ is intersection operation. + const auto& bound_targets = pass.BoundTargets(); + bool matched = bound_targets.find(TARGET(kAny)) != bound_targets.end(); + std::set inter_bound_targets; + std::set_intersection( + bound_targets.begin(), + bound_targets.end(), + targets.begin(), + targets.end(), + std::inserter(inter_bound_targets, inter_bound_targets.begin())); + matched |= !inter_bound_targets.empty(); + const auto& excluded_targets = pass.ExcludedTargets(); + matched &= excluded_targets.find(TARGET(kAny)) == excluded_targets.end(); + std::set inter_excluded_targets; + std::set_intersection( + excluded_targets.begin(), + excluded_targets.end(), + targets.begin(), + targets.end(), + std::inserter(inter_excluded_targets, inter_excluded_targets.begin())); + matched &= inter_excluded_targets.empty(); + return matched; } bool PassMatchesKernels(const mir::Pass& pass) { diff --git a/lite/core/mir/pass_utils.h b/lite/core/mir/pass_utils.h index 942f64bf31..57e8da5e46 100644 --- a/lite/core/mir/pass_utils.h +++ b/lite/core/mir/pass_utils.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "lite/core/mir/pass.h" @@ -24,7 +25,8 @@ namespace lite { bool KernelRegistered(const std::string name, const Place& place); // Check if the pass hits the hardware target. 
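// ---------------------------------------------------------------------------
// A worked example of the matching rule implemented in PassMatchesTarget()
// above:
//   matched = (kAny in bound_targets OR intersection(bound_targets, targets)
//              is non-empty)
//             AND kAny not in excluded_targets
//             AND intersection(excluded_targets, targets) is empty.
// For instance, lite_fc_fuse_pass binds {kAny} and now excludes {kXPU}
// (after target expansion), so it matches valid targets {kARM} but is skipped
// whenever kXPU appears among the valid targets.
// ---------------------------------------------------------------------------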
-bool PassMatchesTarget(const mir::Pass& pass, TargetType target); +bool PassMatchesTarget(const mir::Pass& pass, + const std::set& targets); // Check if the pass hits all necessary operators. bool PassMatchesKernels(const mir::Pass& pass); diff --git a/lite/core/mir/static_kernel_pick_pass.h b/lite/core/mir/static_kernel_pick_pass.h index 7187ddcef6..cd54e2654c 100644 --- a/lite/core/mir/static_kernel_pick_pass.h +++ b/lite/core/mir/static_kernel_pick_pass.h @@ -48,7 +48,8 @@ class StaticKernelPickPass : public mir::StmtPass { private: // Score the kernel. - size_t KernelGrade(const lite::KernelBase& kernel, + size_t KernelGrade(const lite::mir::Node::Stmt& instruct, + const lite::KernelBase& kernel, const std::vector& places) { CHECK_GT(places.size(), 0) << "valid_places is empty."; float final_score{-1.}; @@ -66,10 +67,11 @@ class StaticKernelPickPass : public mir::StmtPass { // valid_places.size() as default. // where i is the place's index in valid_places array. // score: score is the weighted sum of target、percision and layout - for (int i = 0; i < place_size; ++i) { + for (size_t i = 0; i < place_size; ++i) { const auto& place = places[i]; float weight = static_cast(place_size - i) / place_size; size_t score{}; + // The more important factor comes first if (kernel_pick_factors_.IsTargetConsidered() && (place.target == kernel.target() || kernel.target() == TARGET(kAny) || @@ -82,8 +84,12 @@ class StaticKernelPickPass : public mir::StmtPass { (place.precision == kernel.precision() || kernel.precision() == PRECISION(kAny) || place.precision == PRECISION(kAny))) { - score += kMax / static_cast( - core::KernelPickFactor::Factor::PrecisionFirst); + // score skipped, if kernel is int8, but op is not int8 + if (!(kernel.precision() == PRECISION(kInt8) && + !instruct.op_info()->HasAttr("enable_int8"))) { + score += kMax / static_cast( + core::KernelPickFactor::Factor::PrecisionFirst); + } } VLOG(4) << "[score s2]:" << score; if (kernel_pick_factors_.IsDataLayoutConsidered() && @@ -102,17 +108,17 @@ class StaticKernelPickPass : public mir::StmtPass { VLOG(4) << "[score(final)]:" << final_score; VLOG(4) << "-------- pick summary --------"; - VLOG(4) << " ===> place():" << PrecisionToStr(winner_place.precision) << " " - << DataLayoutToStr(winner_place.layout) << " " + VLOG(4) << " ===> winner_place():" << PrecisionToStr(winner_place.precision) + << " " << DataLayoutToStr(winner_place.layout) << " " << TargetToStr(winner_place.target); VLOG(4) << " ===> kernel.place():" << PrecisionToStr(kernel.place().precision) << " " << DataLayoutToStr(kernel.place().layout) << " " << TargetToStr(kernel.place().target); VLOG(4) << "kernel.op_type():" << kernel.op_type(); - VLOG(4) << "picker tactic " << kernel_pick_factors_; - VLOG(4) << "kernel place " << kernel.place().DebugString(); - VLOG(4) << "picker place " << winner_place.DebugString(); + VLOG(4) << "kernel picker factors:" << kernel_pick_factors_; + VLOG(4) << "kernel place:" << kernel.place().DebugString(); + VLOG(4) << "winner_picker place:" << winner_place.DebugString(); VLOG(4) << "------------------------------"; // The data layout is not considered, for the input and output arguments diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.cc b/lite/core/mir/subgraph/generate_npu_program_pass.cc index c83cd70d82..65c29aa68f 100644 --- a/lite/core/mir/subgraph/generate_npu_program_pass.cc +++ b/lite/core/mir/subgraph/generate_npu_program_pass.cc @@ -128,10 +128,10 @@ std::string GenerateNPUProgramPass::BuildNPUGraph( // persistable=true, 
Sothat the model parser can recognize it and save it to // param files if (!lite::npu::BuildModel(inputs, outputs, weight)) { - LOG(WARNING) << "[NPU] Build NPU graph failed (subgraph=" << sub_id << ")"; - throw std::runtime_error("Build NPU graph failed."); + LOG(FATAL) << "[NPU] Build NPU graph failed (subgraph=" << sub_id << ")"; + } else { + LOG(INFO) << "[NPU] Build NPU graph success (subgraph=" << sub_id << ")"; } - LOG(INFO) << "[NPU] Build NPU graph success (subgraph=" << sub_id << ")"; return weight_var_name; } @@ -175,40 +175,19 @@ void GenerateNPUProgramPass::Apply(const std::unique_ptr& graph) { supported_op_types.push_back(i.first); } - try { - int num_subgraph = FuseSubgraph(graph, supported_op_types); - InferOnce(graph); - auto op_nodes_all = ClassifySubgraph(graph); - CHECK_EQ(op_nodes_all.size(), num_subgraph); - int id = 1; - for (auto& op_nodes : op_nodes_all) { - LOG(INFO) << "[NPU] Converting Subgraph " << id; - GenNPUSubgraph(graph, op_nodes.second, id); - LOG(INFO) << "[NPU] After NPU Pass Subgraph " << id << "\n" - << Visualize(graph.get()); - id++; - } - } catch (...) { - LOG(WARNING) << "[NPU] Build NPU graph failed."; - throw std::runtime_error("[NPU] Build NPU graph failed."); - } - - for (auto& item : graph->StmtTopologicalOrder()) { - if (item->IsStmt()) { - auto& stmt = item->AsStmt(); - LOG(INFO) << stmt; - insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); - } + int num_subgraph = FuseSubgraph(graph, supported_op_types); + InferOnce(graph); + auto op_nodes_all = ClassifySubgraph(graph); + CHECK_EQ(op_nodes_all.size(), num_subgraph); + int id = 1; + for (auto& op_nodes : op_nodes_all) { + LOG(INFO) << "[NPU] Converting Subgraph " << id; + GenNPUSubgraph(graph, op_nodes.second, id); + LOG(INFO) << "[NPU] After NPU Pass Subgraph " << id << "\n" + << Visualize(graph.get()); + id++; } } - -std::unique_ptr GenerateNPUProgramPass::GenProgram() { - LOG(INFO) << "[NPU] program insts.size " << insts_.size(); - std::unique_ptr program( - new RuntimeProgram(std::move(insts_))); - return program; -} - } // namespace subgraph } // namespace mir } // namespace lite diff --git a/lite/core/mir/subgraph/generate_npu_program_pass.h b/lite/core/mir/subgraph/generate_npu_program_pass.h index 823ca5f1f6..5b1a98c6ed 100644 --- a/lite/core/mir/subgraph/generate_npu_program_pass.h +++ b/lite/core/mir/subgraph/generate_npu_program_pass.h @@ -35,7 +35,6 @@ class GenerateNPUProgramPass : public SubgraphProgramPass { using key2nodes_t = std::map; void Apply(const std::unique_ptr& graph) override; - std::unique_ptr GenProgram(); protected: // nodes2cvt: op nodes to convert @@ -54,9 +53,6 @@ class GenerateNPUProgramPass : public SubgraphProgramPass { void GenNPUSubgraph(const std::unique_ptr& graph, const std::unordered_set& op_nodes, int sub_id); - - private: - std::vector insts_; }; } // namespace subgraph diff --git a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc index 95339d6175..1afb54c692 100644 --- a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc +++ b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc @@ -160,8 +160,8 @@ TEST(NPUSubgraph, compare) { TestModel(FLAGS_model_dir, FLAGS_model_file, FLAGS_params_file, - {lite_api::Place{TARGET(kARM), PRECISION(kFloat)}, - lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}}, + {lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}, + lite_api::Place{TARGET(kARM), PRECISION(kFloat)}}, input_tensor_shape, FLAGS_optimized_model_dir + "/NPU"); // 
verify results diff --git a/lite/core/mir/subgraph/generate_xpu_program_pass.cc b/lite/core/mir/subgraph/generate_xpu_program_pass.cc index 319e1e51fe..4340cb4ee3 100644 --- a/lite/core/mir/subgraph/generate_xpu_program_pass.cc +++ b/lite/core/mir/subgraph/generate_xpu_program_pass.cc @@ -115,10 +115,10 @@ std::string GenerateXPUProgramPass::BuildXPUGraph( graph_ctx.params, &ordered_cvted_var_nodes, weight)) { - LOG(WARNING) << "[XPU] Build XPU graph failed (subgraph=" << sub_id << ")"; - throw std::runtime_error("[XPU] Build XPU graph failed."); + LOG(FATAL) << "[XPU] Build XPU graph failed (subgraph=" << sub_id << ")"; + } else { + LOG(INFO) << "[XPU] Build XPU graph success (subgraph=" << sub_id << ")"; } - LOG(INFO) << "[XPU] Build XPU graph success (subgraph=" << sub_id << ")"; return weight_var_name; } @@ -162,40 +162,19 @@ void GenerateXPUProgramPass::Apply(const std::unique_ptr& graph) { supported_op_types.push_back(i.first); } - try { - int num_subgraph = FuseSubgraph(graph, supported_op_types); - InferOnce(graph); - auto op_nodes_all = ClassifySubgraph(graph); - CHECK_EQ(op_nodes_all.size(), num_subgraph); - int id = 1; - for (auto& op_nodes : op_nodes_all) { - LOG(INFO) << "[XPU] Converting Subgraph " << id; - GenXPUSubgraph(graph, op_nodes.second, id); - LOG(INFO) << "[XPU] After XPU Pass Subgraph " << id << "\n" - << Visualize(graph.get()); - id++; - } - } catch (...) { - LOG(WARNING) << "[XPU] Build XPU graph failed."; - throw std::runtime_error("[XPU] Build XPU graph failed."); - } - - for (auto& item : graph->StmtTopologicalOrder()) { - if (item->IsStmt()) { - auto& stmt = item->AsStmt(); - LOG(INFO) << stmt; - insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); - } + int num_subgraph = FuseSubgraph(graph, supported_op_types); + InferOnce(graph); + auto op_nodes_all = ClassifySubgraph(graph); + CHECK_EQ(op_nodes_all.size(), num_subgraph); + int id = 1; + for (auto& op_nodes : op_nodes_all) { + LOG(INFO) << "[XPU] Converting Subgraph " << id; + GenXPUSubgraph(graph, op_nodes.second, id); + LOG(INFO) << "[XPU] After XPU Pass Subgraph " << id << "\n" + << Visualize(graph.get()); + id++; } } - -std::unique_ptr GenerateXPUProgramPass::GenProgram() { - LOG(INFO) << "[XPU] program insts.size=" << insts_.size(); - std::unique_ptr program( - new RuntimeProgram(std::move(insts_))); - return program; -} - } // namespace subgraph } // namespace mir } // namespace lite diff --git a/lite/core/mir/subgraph/generate_xpu_program_pass.h b/lite/core/mir/subgraph/generate_xpu_program_pass.h index cf121ae950..777642cfb6 100644 --- a/lite/core/mir/subgraph/generate_xpu_program_pass.h +++ b/lite/core/mir/subgraph/generate_xpu_program_pass.h @@ -35,7 +35,6 @@ class GenerateXPUProgramPass : public SubgraphProgramPass { using key2nodes_t = std::map; void Apply(const std::unique_ptr& graph) override; - std::unique_ptr GenProgram(); protected: // nodes2cvt: op nodes to convert @@ -58,9 +57,6 @@ class GenerateXPUProgramPass : public SubgraphProgramPass { void GenXPUSubgraph(const std::unique_ptr& graph, const std::unordered_set& op_nodes, int sub_id); - - private: - std::vector insts_; }; } // namespace subgraph diff --git a/lite/core/mir/type_layout_cast_pass.cc b/lite/core/mir/type_layout_cast_pass.cc index 9d63dcbb38..b3b7a858f6 100644 --- a/lite/core/mir/type_layout_cast_pass.cc +++ b/lite/core/mir/type_layout_cast_pass.cc @@ -127,24 +127,30 @@ void TypeLayoutTransformPass::AddLayoutInst( for (auto& kernel : kernels) { const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const 
Type* out_arg_ty = kernel->GetOutputDeclType("Out"); -#ifdef LITE_WITH_OPENCL + // layout kernel choose // must ignore [layout check] for layout of kernels's input and output - if (TargetCompatibleTo(*in_arg_ty, from) && - PrecisionCompatibleTo(*in_arg_ty, from) && - DeviceCompatibleTo(*in_arg_ty, from) && - out_arg_ty->layout() == to.layout()) { -#else - if (TypeCompatible(*in_arg_ty, from) && - out_arg_ty->layout() == to.layout()) { -#endif + // note: replace LITE_WITH_OPENCL macro with judge input and output target + // of layout_trans + if ((in_arg_ty->target() == TARGET(kOpenCL) || + out_arg_ty->target() == TARGET(kOpenCL)) && // judge OpenCL first + (TargetCompatibleTo(*in_arg_ty, from) && + PrecisionCompatibleTo(*in_arg_ty, from) && + DeviceCompatibleTo(*in_arg_ty, from) && + out_arg_ty->layout() == to.layout())) { + is_found = true; + } else if (TypeCompatible(*in_arg_ty, from) && + out_arg_ty->layout() == to.layout()) { is_found = true; + } + if (is_found) { selected_kernels.emplace_back(std::move(kernel)); // we pick the kernel layout_inst->AsStmt(layout_type, std::move(selected_kernels), layout_op); break; } } + CHECK(is_found) << "Can't find a layout kernel for layout op: " << from << ":" << in->AsArg().name << "->" << to << ":" << inst_node->AsStmt().op_info()->Type(); diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc index 7a32777865..b008faa687 100644 --- a/lite/core/mir/type_target_cast_pass.cc +++ b/lite/core/mir/type_target_cast_pass.cc @@ -128,10 +128,9 @@ void TypeTargetTransformPass::AddIoCopyInst( VLOG(4) << "out_arg_ty(io_copy kernel output):" << *out_arg_ty; VLOG(4) << "to:" << to << "\n"; -// kernel choose branch for opencl backend -// judge inst's target whether is kOpenCL -// Note: to == *decl_arg_type == in of inst, not output of last inst -#ifdef LITE_WITH_OPENCL + // kernel choose branch for opencl backend + // judge inst's target whether is kOpenCL + // Note: to == *decl_arg_type == in of inst, not output of last inst // ignore [layout check] for layout between [to] and [from] // Because all of origin opencl insts in model, are not default layout // NCHW, @@ -141,25 +140,34 @@ void TypeTargetTransformPass::AddIoCopyInst( // [*decl_arg_type] -> [to]: input of inst, not output of last // [in_arg_ty]: in of io_copy // [out_arg_ty]: out of io_copy - if (TargetCompatibleTo(*in_arg_ty, from) && - PrecisionCompatibleTo(*in_arg_ty, from) && - DeviceCompatibleTo(*in_arg_ty, from) && - TargetCompatibleTo(*out_arg_ty, to)) { - VLOG(4) << "do nothing. 
opencl found"; -#else - if (TypeCompatible(*in_arg_ty, from) && - out_arg_ty->target() == to.target()) { -#endif + // + // noto: replace LITE_WITH_OPENCL macro with judge input and output target + // of io_copy + if ((in_arg_ty->target() == TARGET(kOpenCL) || + out_arg_ty->target() == TARGET(kOpenCL)) && // judge OpenCL first + (TargetCompatibleTo(*in_arg_ty, from) && + PrecisionCompatibleTo(*in_arg_ty, from) && + DeviceCompatibleTo(*in_arg_ty, from) && + TargetCompatibleTo(*out_arg_ty, to))) { + VLOG(4) << "picked, opencl found"; + is_found = true; + } else if (TypeCompatible(*in_arg_ty, from) && + out_arg_ty->target() == to.target()) { VLOG(4) << "picked"; is_found = true; + } + + if (is_found) { selected_kernels.emplace_back(std::move(kernel)); // we pick the kernel io_copy_inst->AsStmt( io_copy_type, std::move(selected_kernels), io_copy_op); break; } + VLOG(4) << "not picked"; } + CHECK(is_found) << "Can't find a io_copy kernel for io_copy op: " << from << ":" << in->AsArg().name << " -> " << to << ":" << inst_node->AsStmt().op_info()->Type(); diff --git a/lite/core/mir/variable_place_inference_pass.h b/lite/core/mir/variable_place_inference_pass.h index fe6ecfd66d..3f5d161a56 100644 --- a/lite/core/mir/variable_place_inference_pass.h +++ b/lite/core/mir/variable_place_inference_pass.h @@ -54,40 +54,50 @@ class VariablePlaceInferencePass : public DebugPass { } } - // Set the tye of the weight - void SetWeightType(Node* w, const LiteType& type) { -// TODO(xg) to optimize this -#ifdef LITE_WITH_FPGA - w->AsArg().type = LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); -#endif - -#ifdef LITE_WITH_OPENCL - w->AsArg().type = LiteType::GetTensorTy( - TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); -#endif - -#ifndef LITE_WITH_FPGA -#ifndef LITE_WITH_OPENCL - w->AsArg().type = LiteType::GetTensorTy( - TARGET(kHost), type.precision(), DATALAYOUT(kNCHW)); -#endif -#endif + // Set the type of the weight + void SetWeightType(Node* w, + const LiteType& type, + const std::map& lite_with_targets) { + VLOG(4) << "type.precision():" << PrecisionRepr(type.precision()); + if (lite_with_targets.at("kFPGA")) { + w->AsArg().type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + } else if (lite_with_targets.at("kOpenCL")) { + w->AsArg().type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + } else { + w->AsArg().type = LiteType::GetTensorTy( + TARGET(kHost), type.precision(), DATALAYOUT(kNCHW)); + } } void InferenceArgumentPlace(SSAGraph* graph) { + auto& valid_places = graph->valid_places(); + auto valid_places_has_target = [&](TargetType t) -> bool { + for (auto& p : valid_places) { + if (p.target == t) { + return true; + } + } + return false; + }; + std::map lite_with_targets{ + {"kOpenCL", valid_places_has_target(TARGET(kOpenCL))}, + {"kFPGA", valid_places_has_target(TARGET(kFPGA))}}; + VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"]; + VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"]; + VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global(); for (auto& x : graph->StmtTopologicalOrder()) { auto& inst = x->AsStmt(); -// The IoCopyOp is a tool operator, it won't support the type inference. 
-// in fpga, we has io_copy+cali+layout tool ops, so we need type inference for -// tool operator -#ifndef LITE_WITH_FPGA -#ifndef LITE_WITH_OPENCL - VLOG(3) << "inst.op_type() == 'io_copy', continue"; - if (inst.op_type() == "io_copy") continue; -#endif -#endif + // The IoCopyOp is a tool operator, it won't support the type inference. + // in fpga, we has io_copy+cali+layout tool ops, so we need type inference + // for + // tool operator + if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) { + VLOG(3) << "inst.op_type() == 'io_copy', continue"; + if (inst.op_type() == "io_copy") continue; + } // deal with inputs VLOG(4) << "Infering op " << inst.op_info()->Repr(); // TODO(zhaolong): Add check if the node's name in op's arguments. @@ -115,7 +125,7 @@ class VariablePlaceInferencePass : public DebugPass { if (!x_in->AsArg().type) { VLOG(4) << "set type " << *type << " " << x_in->AsArg().name; if (x_in->AsArg().is_weight) { - SetWeightType(x_in, *type); + SetWeightType(x_in, *type, lite_with_targets); } else { x_in->AsArg().type = type; } @@ -135,7 +145,7 @@ class VariablePlaceInferencePass : public DebugPass { if (!x_out->AsArg().type) { VLOG(4) << "set type " << *type << " " << x_out->AsArg().name; if (x_out->AsArg().is_weight) { - SetWeightType(x_out, *type); + SetWeightType(x_out, *type, lite_with_targets); } else { x_out->AsArg().type = type; } diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc index 1400b25409..887ac3c950 100644 --- a/lite/core/op_registry.cc +++ b/lite/core/op_registry.cc @@ -118,6 +118,8 @@ KernelRegistry::KernelRegistry() INIT_FOR(kCUDA, kAny, kNCHW); INIT_FOR(kCUDA, kAny, kAny); INIT_FOR(kCUDA, kInt8, kNHWC); + INIT_FOR(kCUDA, kInt64, kNCHW); + INIT_FOR(kCUDA, kInt64, kNHWC); INIT_FOR(kHost, kFloat, kNCHW); INIT_FOR(kHost, kAny, kNCHW); diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index 7ed632d864..d78ae690f9 100644 --- a/lite/core/op_registry.h +++ b/lite/core/op_registry.h @@ -145,6 +145,12 @@ class KernelRegistry final { KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // - KernelRegistryForTarget *, // diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 22c5f19330..38c9d0e29d 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -13,7 +13,9 @@ // limitations under the License. #pragma once +#include #include +#include #include #include #include "lite/core/mir/generate_program_pass.h" @@ -49,23 +51,20 @@ class Optimizer { valid_places_ = valid_places; CHECK(!valid_places.empty()) << "At least one valid_place should be set"; CHECK(!graph_) << "duplicate optimize found"; + graph_.reset(new mir::SSAGraph); graph_->Build(program, valid_places); graph_->SetValidPlaces(valid_places); SpecifyKernelPickTactic(kernel_pick_factor); InitTargetTypeTransformPass(); + if (passes.empty()) { - RunPasses(std::vector{ - { - #if 0 - "lite_quant_dequant_fuse_pass", // + std::vector passes_local{ + {"lite_quant_dequant_fuse_pass", // "lite_conv_elementwise_fuse_pass", // conv-elemwise-bn "lite_conv_bn_fuse_pass", // "lite_conv_elementwise_fuse_pass", // conv-bn-elemwise - // This pass is disabled to force some opencl kernels selected for - // final running, otherwise, they will be fused to ARM fusion - // kernels, and the OpenCL devices will be discarded. 
// TODO(Superjomn) Refine the fusion related design to select fusion // kernels for devices automatically. "lite_conv_activation_fuse_pass", // @@ -74,11 +73,10 @@ class Optimizer { "lite_transpose_softmax_transpose_fuse_pass", // "lite_interpolate_fuse_pass", // "identity_scale_eliminate_pass", // -#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK +#if (defined LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) || (defined LITE_WITH_CUDA) "lite_elementwise_add_activation_fuse_pass", // -#endif -#endif - "static_kernel_pick_pass", // pick original kernel from graph +#endif + "static_kernel_pick_pass", // pick original kernel from graph "variable_place_inference_pass", // inference arg/var's // info(target/precision/layout/device) // using kernel info @@ -107,17 +105,12 @@ class Optimizer { "argument_type_display_pass", // "variable_place_inference_pass", // - "argument_type_display_pass", // + "argument_type_display_pass", "runtime_context_assign_pass", - "argument_type_display_pass", // -#if !defined(LITE_WITH_OPENCL) && !defined(LITE_WITH_NPU) && \ - !defined(LITE_WITH_XPU) - // TODO(ysh329): cause CL_INVALID_MEM_OBJECT when setArg in kernel - "memory_optimize_pass", -#endif - "argument_type_display_pass" - }}); + "argument_type_display_pass", + "memory_optimize_pass"}}; + RunPasses(passes_local); } else { RunPasses(passes); } @@ -128,39 +121,13 @@ class Optimizer { // Generate a new program based on the mir graph. std::unique_ptr GenRuntimeProgram() { -#if defined(LITE_WITH_NPU) || defined(LITE_WITH_XPU) - auto target_place = Place{ -#ifdef LITE_WITH_NPU - TARGET(kNPU), -#endif -#ifdef LITE_WITH_XPU - TARGET(kXPU), -#endif - PRECISION(kFloat)}; - if (std::find(valid_places_.begin(), valid_places_.end(), target_place) != - valid_places_.end()) { -#ifdef LITE_WITH_NPU - auto pass = mir::PassManager::Global() - .LookUp( - "generate_npu_program_pass"); -#endif -#ifdef LITE_WITH_XPU - auto pass = mir::PassManager::Global() - .LookUp( - "generate_xpu_program_pass"); -#endif - try { - pass->Apply(graph_); - auto program = pass->GenProgram(); - CHECK(exec_scope_); - program->set_exec_scope(exec_scope_); - return program; - } catch (...) { - LOG(WARNING) << "Build " << TargetToStr(target_place.target) - << " program failed!"; - } - } -#endif + // Extra passes are applied for NPU and XPU, they depends on the shapes + // of input tensors. so GenRuntimeProgram() must be called after the shapes + // of input tensors are determined. 
+ std::vector subgraph_passes{"generate_npu_program_pass", + "generate_xpu_program_pass"}; + RunPasses(subgraph_passes); + auto pass = mir::PassManager::Global().LookUp( "generate_program_pass"); pass->Apply(graph_); @@ -202,14 +169,16 @@ class Optimizer { for (auto& x : passes) { LOG(INFO) << "== Running pass: " << x; mir::Pass* pass = mir::PassManager::Global().LookUp(x); - CHECK(pass) << "Can not find pass: " << x; - bool matched = false; + if (!pass) { + LOG(INFO) << " - Skip " << x << " because the pass isn't found."; + continue; + } + std::set targets; for (const auto& place : valid_places_) { - if (PassMatchesTarget(*pass, place.target)) { - matched = true; - } + targets.insert(place.target); } - matched = matched && PassMatchesKernels(*pass); + bool matched = + PassMatchesTarget(*pass, targets) && PassMatchesKernels(*pass); if (!matched) { LOG(INFO) << " - Skip " << x << " because the target or kernel does not match."; diff --git a/lite/core/profile/CMakeLists.txt b/lite/core/profile/CMakeLists.txt index 54a2390244..b7ddd810af 100644 --- a/lite/core/profile/CMakeLists.txt +++ b/lite/core/profile/CMakeLists.txt @@ -5,4 +5,5 @@ endif() lite_cc_library(basic_profiler SRCS basic_profiler.cc DEPS gflags) lite_cc_test(test_basic_profiler SRCS basic_profiler_test.cc DEPS basic_profiler) - +lite_cc_library(lite_profiler SRCS profiler.cc DEPS context) +lite_cc_test(test_lite_timer SRCS test_timer.cc DEPS lite_profiler) diff --git a/lite/core/profile/profiler.cc b/lite/core/profile/profiler.cc new file mode 100644 index 0000000000..a51b769c8f --- /dev/null +++ b/lite/core/profile/profiler.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
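The RunPasses() hunk above changes two behaviours: a pass that is not registered is now skipped with a log message instead of aborting, and target matching is done against the set of targets collected from valid_places_. Below is a minimal, self-contained sketch of that control flow; the Pass/registry types and the target bindings are stand-ins for illustration, not the real lite::mir classes.

```cpp
// Illustrative analog of the new pass-skipping behaviour in Optimizer::RunPasses():
// unknown passes are skipped with a log line instead of aborting, and a pass runs
// only if it matches at least one of the valid targets.
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

struct Pass {
  std::set<std::string> bound_targets;  // targets this pass is registered for (made up here)
  bool MatchesTargets(const std::set<std::string>& valid) const {
    for (const auto& t : bound_targets)
      if (valid.count(t)) return true;
    return false;
  }
};

int main() {
  std::map<std::string, Pass> registry{
      {"static_kernel_pick_pass", {{"kARM", "kOpenCL"}}},
      {"memory_optimize_pass", {{"kARM"}}}};          // made-up bindings for illustration
  std::set<std::string> valid_targets{"kOpenCL"};     // collected from valid_places_
  std::vector<std::string> passes{"lite_quant_dequant_fuse_pass",
                                  "static_kernel_pick_pass",
                                  "memory_optimize_pass"};
  for (const auto& name : passes) {
    auto it = registry.find(name);
    if (it == registry.end()) {  // pass not built into this binary: skip, don't abort
      std::cout << " - Skip " << name << " because the pass isn't found.\n";
      continue;
    }
    if (!it->second.MatchesTargets(valid_targets)) {  // no valid target matches: skip
      std::cout << " - Skip " << name << " because the target does not match.\n";
      continue;
    }
    std::cout << "== Running pass: " << name << "\n";
  }
}
```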
+ +#include "lite/core/profile/profiler.h" +#include +#include +#include + +namespace paddle { +namespace lite { +namespace profile { + +int Profiler::NewTimer(const OpCharacter& ch) { + StatisUnit unit; + unit.character = ch; + if (ch.target == TargetType::kCUDA) { +#ifdef LITE_WITH_CUDA + unit.timer.reset(new DeviceTimer()); +#else + LOG(ERROR) << "The timer type specified as cuda is uninitialized, so the " + "default x86 timer is used instead."; +#endif + } else { + unit.timer.reset(new DeviceTimer()); + } + units_.push_back(std::move(unit)); + return units_.size() - 1; +} + +void Profiler::StartTiming(const int index, KernelContext* ctx) { + CHECK_LT(index, units_.size()) + << "The timer index in the profiler is out of range."; + units_[index].timer->Start(ctx); +} + +float Profiler::StopTiming(const int index, KernelContext* ctx) { + CHECK_LT(index, units_.size()) + << "The timer index in the profiler is out of range."; + return units_[index].timer->Stop(ctx); +} + +std::string Profiler::Summary(bool concise) { + STL::stringstream ss; + auto cout_title = [&ss](const std::string& title, const std::string& name) { + // clang-format off + ss << "===== " << title << ": " << name << " =====" << std::endl; + ss << std::setw(25) << std::left << "Operator Type" \ + << std::setw(40) << std::left << "Kernel Name" \ + << std::setw(10) << std::left << "Remark" \ + << std::setw(10) << std::left << "Avg (ms)" \ + << std::setw(10) << std::left << "Min (ms)" \ + << std::setw(10) << std::left << "Max (ms)" \ + << std::endl; + // clang-format on + }; + if (concise) { + auto op_comp = [](const OpCharacter& c1, const OpCharacter& c2) { + return (c1.target < c2.target) || (c1.op_type < c2.op_type) || + (c1.kernel_name < c2.kernel_name) || (c1.remark < c2.remark); + }; + std::map summary(op_comp); + for (auto& unit : units_) { + auto ch = summary.find(unit.character); + if (ch != summary.end()) { + ch->second.avg += unit.timer->LapTimes().Avg(); + ch->second.min += unit.timer->LapTimes().Min(); + ch->second.max += unit.timer->LapTimes().Max(); + } else { + TimeInfo info({unit.timer->LapTimes().Avg(), + unit.timer->LapTimes().Min(), + unit.timer->LapTimes().Max()}); + summary.insert({unit.character, info}); + } + } + cout_title("Concise Profiler Summary", name_); + for (const auto& item : summary) { + // clang-format off + ss << std::setw(25) << std::left << item.first.op_type \ + << std::setw(40) << std::left << item.first.kernel_name \ + << std::setw(10) << std::left << item.first.remark \ + << std::setw(10) << std::left << item.second.avg \ + << std::setw(10) << std::left << item.second.min \ + << std::setw(10) << std::left << item.second.max \ + << std::endl; + // clang-format on + } + } else { + cout_title("Detailed Profiler Summary", name_); + for (auto& unit : units_) { + // clang-format off + ss << std::setw(25) << std::left << unit.character.op_type \ + << std::setw(40) << std::left << unit.character.kernel_name \ + << std::setw(10) << std::left << unit.character.remark \ + << std::setw(10) << std::left << unit.timer->LapTimes().Avg() \ + << std::setw(10) << std::left << unit.timer->LapTimes().Min() \ + << std::setw(10) << std::left << unit.timer->LapTimes().Max() \ + << std::endl; + // clang-format on + } + } + return ss.str(); +} + +} // namespace profile +} // namespace lite +} // namespace paddle diff --git a/lite/core/profile/profiler.h b/lite/core/profile/profiler.h new file mode 100644 index 0000000000..0fce8167cd --- /dev/null +++ b/lite/core/profile/profiler.h @@ -0,0 +1,59 @@ +// 
Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "lite/core/profile/timer.h" + +namespace paddle { +namespace lite { +namespace profile { + +struct TimeInfo { + float avg; + float min; + float max; +}; + +struct OpCharacter { + TargetType target; + std::string op_type{std::string("N/A")}; + std::string kernel_name{std::string("N/A")}; + std::string remark{std::string("N/A")}; +}; + +struct StatisUnit { + std::unique_ptr timer; + OpCharacter character; +}; + +class Profiler final { + public: + Profiler() = default; + explicit Profiler(const std::string& name) : name_(name) {} + int NewTimer(const OpCharacter& ch); + void StartTiming(const int index, KernelContext* ctx); + float StopTiming(const int index, KernelContext* ctx); + std::string Summary(bool concise = true); + + private: + std::string name_{std::string("N/A")}; + std::vector units_; +}; + +} // namespace profile +} // namespace lite +} // namespace paddle diff --git a/lite/core/profile/test_timer.cc b/lite/core/profile/test_timer.cc new file mode 100644 index 0000000000..6f49698ef4 --- /dev/null +++ b/lite/core/profile/test_timer.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
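One note on the concise summary in profiler.cc above: op_comp chains `<` comparisons with `||`, which is not a strict weak ordering (two OpCharacter values can each compare as "less" than the other), so the std::map grouping can misbehave. A common alternative, sketched here on a stand-in struct rather than the real OpCharacter, is lexicographic comparison via std::tie.

```cpp
// Lexicographic key comparison for grouping profiler records, sketched on a
// stand-in struct. std::tie yields a proper strict weak ordering over
// (target, op_type, kernel_name, remark).
#include <map>
#include <string>
#include <tuple>

struct OpCharacterLike {
  int target;  // stand-in for TargetType
  std::string op_type;
  std::string kernel_name;
  std::string remark;
};

inline bool OpCharLess(const OpCharacterLike& a, const OpCharacterLike& b) {
  return std::tie(a.target, a.op_type, a.kernel_name, a.remark) <
         std::tie(b.target, b.op_type, b.kernel_name, b.remark);
}

int main() {
  std::map<OpCharacterLike, int,
           bool (*)(const OpCharacterLike&, const OpCharacterLike&)>
      summary(OpCharLess);
  summary[{0, "conv2d", "conv_compute", "3x3s1"}] += 1;  // first record creates the key
  summary[{0, "conv2d", "conv_compute", "3x3s1"}] += 1;  // same key groups into one slot
  return summary.size() == 1 ? 0 : 1;
}
```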
+ +#include +#include // NOLINT +#include // NOLINT +#include "lite/core/context.h" +#include "lite/core/profile/profiler.h" +#include "lite/core/profile/timer.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace profile { + +TEST(timer, real_latency) { + Timer timer; + + timer.Start(); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + timer.Stop(); + + timer.Start(); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + timer.Stop(); + + LOG(INFO) << "LapTimes().Avg() = " << timer.LapTimes().Avg(); +} + +#ifdef LITE_WITH_CUDA +TEST(gpu_timer, real_latency) { + DeviceTimer timer; + KernelContext ctx; + cudaStream_t exec_stream; + cudaStreamCreate(&exec_stream); + (&ctx.As())->SetExecStream(exec_stream); + + timer.Start(&ctx); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + timer.Stop(&ctx); + + (&timer)->Start(&ctx); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + timer.Stop(&ctx); + + LOG(INFO) << "LapTimes().Avg() = " << timer.LapTimes().Avg(); +} + +TEST(profiler, real_latency) { + KernelContext ctx; + cudaStream_t exec_stream; + cudaStreamCreate(&exec_stream); + (&ctx.As())->SetExecStream(exec_stream); + + Profiler profiler("name"); + profile::OpCharacter ch; + ch.target = TargetType::kCUDA; + ch.op_type = "operator/1"; + ch.kernel_name = "kernel/1"; + int idx = profiler.NewTimer(ch); + profiler.StartTiming(idx, &ctx); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + profiler.StopTiming(idx, &ctx); + std::cout << profiler.Summary(); +} +#endif + +} // namespace profile +} // namespace lite +} // namespace paddle diff --git a/lite/core/profile/timer.h b/lite/core/profile/timer.h new file mode 100644 index 0000000000..1e86f0d7b9 --- /dev/null +++ b/lite/core/profile/timer.h @@ -0,0 +1,114 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
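For reference, the host-side lap timing that timer.h below implements boils down to std::chrono arithmetic: take start and stop time points, convert the difference to milliseconds, and keep every lap so Avg()/Min()/Max() can be reported. A self-contained sketch of that computation:

```cpp
// Host-side lap timing as done by Timer::Start()/Stop(): microsecond duration
// between two system_clock time points, converted to milliseconds and stored
// per lap so an average can be reported afterwards.
#include <chrono>
#include <iostream>
#include <thread>
#include <vector>

int main() {
  std::vector<float> laps_ms;
  for (int ms : {10, 50}) {
    auto t_start = std::chrono::system_clock::now();
    std::this_thread::sleep_for(std::chrono::milliseconds(ms));
    auto t_stop = std::chrono::system_clock::now();
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(t_stop - t_start);
    laps_ms.push_back(us.count() / 1000.f);  // microseconds -> milliseconds
  }
  float sum = 0.f;
  for (float v : laps_ms) sum += v;
  std::cout << "Avg = " << sum / laps_ms.size() << " ms\n";  // roughly 30 ms here
}
```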
+ +#pragma once +#include +#include // NOLINT +#include +#ifdef LITE_WITH_CUDA +#include "lite/backends/cuda/cuda_utils.h" +#endif +#include "lite/core/context.h" + +namespace paddle { +namespace lite { +namespace profile { + +template +class TimeList { + public: + void Clear() { laps_t_.clear(); } + void Add(T t) { laps_t_.push_back(t); } + T Max() const { return *std::max_element(laps_t_.begin(), laps_t_.end()); } + T Min() const { return *std::min_element(laps_t_.begin(), laps_t_.end()); } + T Sum() const { return std::accumulate(laps_t_.begin(), laps_t_.end(), 0.0); } + size_t Size() const { return laps_t_.size(); } + T Avg() const { + if (!Size()) { + return 0; + } + return Sum() / Size(); + } + const std::list& Raw() const { return laps_t_; } + + private: + std::list laps_t_; +}; + +class Timer { + public: + Timer() = default; + virtual ~Timer() = default; + + void Reset() { laps_t_.Clear(); } + void Start() { t_start_ = std::chrono::system_clock::now(); } + float Stop() { + t_stop_ = std::chrono::system_clock::now(); + auto ts = std::chrono::duration_cast(t_stop_ - + t_start_); + float elapse_ms = 1000.f * static_cast(ts.count()) * + std::chrono::microseconds::period::num / + std::chrono::microseconds::period::den; + this->laps_t_.Add(elapse_ms); + return elapse_ms; + } + virtual void Start(KernelContext* ctx) { return Start(); } + virtual float Stop(KernelContext* ctx) { return Stop(); } + float AvgLapTimeMs() const { return laps_t_.Avg(); } + const TimeList& LapTimes() const { return laps_t_; } + + protected: + std::chrono::time_point t_start_, t_stop_; + TimeList laps_t_; +}; + +template +class DeviceTimer final : public Timer {}; + +#ifdef LITE_WITH_CUDA +template <> +class DeviceTimer final : public Timer { + public: + DeviceTimer() { + CUDA_CALL(cudaEventCreate(&e_start_)); + CUDA_CALL(cudaEventCreate(&e_stop_)); + } + ~DeviceTimer() { + CUDA_CALL(cudaEventDestroy(e_start_)); + CUDA_CALL(cudaEventDestroy(e_stop_)); + } + void Start(KernelContext* ctx) { + cudaStream_t stream; + stream = ctx->As().exec_stream(); + CUDA_CALL(cudaEventRecord(e_start_, stream)); + } + float Stop(KernelContext* ctx) { + cudaStream_t stream; + stream = ctx->As().exec_stream(); + CUDA_CALL(cudaEventRecord(e_stop_, stream)); + CUDA_CALL(cudaEventSynchronize(e_stop_)); + float elapse_ms = 1.f; + CUDA_CALL(cudaEventElapsedTime(&elapse_ms, e_start_, e_stop_)); + this->laps_t_.Add(elapse_ms); + return elapse_ms; + } + + private: + cudaEvent_t e_start_, e_stop_; +}; +#endif + +} // namespace profile +} // namespace lite +} // namespace paddle diff --git a/lite/core/program.cc b/lite/core/program.cc index b60f279c0f..45796a478b 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -122,6 +122,9 @@ void RuntimeProgram::Run() { #endif // LITE_WITH_PRECISION_PROFILE #endif // LITE_WITH_PROFILE } +#ifdef LITE_WITH_PROFILE + LOG(INFO) << "\n" << profiler_.Summary(); +#endif // LITE_WITH_PROFILE } void Program::Build(const cpp::ProgramDesc& prog) { @@ -183,11 +186,6 @@ void Program::PrepareWorkspace(const cpp::ProgramDesc& prog) { void Instruction::Run() { CHECK(op_) << "op null"; CHECK(kernel_) << "kernel null"; -#ifdef LITE_WITH_PROFILE - if (profile_id_ >= 0) { - profile::ProfileBlock x(profile_id_, "instruction"); - } -#endif // LITE_WITH_PROFILE if (first_epoch_) { first_epoch_ = false; CHECK(op_->CheckShape()); diff --git a/lite/core/program.h b/lite/core/program.h index 7a6700da61..1c1e4975c3 100644 --- a/lite/core/program.h +++ b/lite/core/program.h @@ -22,9 +22,6 @@ #include 
"lite/core/op_lite.h" #include "lite/core/op_registry.h" #include "lite/model_parser/cpp/program_desc.h" -#ifdef LITE_WITH_PROFILE -#include "lite/core/profile/basic_profiler.h" -#endif // LITE_WITH_PROFILE namespace paddle { namespace lite { @@ -87,22 +84,7 @@ struct Program { struct Instruction { Instruction(const std::shared_ptr& op, std::unique_ptr&& kernel) - : op_(op), kernel_(std::move(kernel)) { -#ifdef LITE_WITH_PROFILE - if (op_->Type() != "feed" && op_->Type() != "fetch") { - profile_id_ = profile::BasicProfiler::Global() - .NewRcd(kernel_->SerializedKernelType()) - .id(); - kernel_->SetProfileID(profile_id_); - // Set profile custom info - auto& profiler = - *profile::BasicProfiler::Global().mutable_record( - profile_id_); - profiler.SetCustomInfo("op_type", op_->Type()); - profiler.SetCustomInfo("op_info", op_->SerializedOpInfo()); - } -#endif // LITE_WITH_PROFILE - } + : op_(op), kernel_(std::move(kernel)) {} // Run the instruction. void Run(); @@ -113,6 +95,20 @@ struct Instruction { const KernelBase* kernel() const { return kernel_.get(); } KernelBase* mutable_kernel() { return kernel_.get(); } +#ifdef LITE_WITH_PROFILE + void set_profiler(profile::Profiler* profiler) { + profiler_ = profiler; + if (op_->Type() != "feed" && op_->Type() != "fetch") { + profile::OpCharacter ch; + ch.target = kernel()->target(); + ch.op_type = op_->Type(); + ch.kernel_name = kernel()->name(); + profile_id_ = profiler->NewTimer(ch); + kernel_->SetProfiler(profiler_, profile_id_); + } + } +#endif + private: std::shared_ptr op_; std::unique_ptr kernel_; @@ -120,7 +116,7 @@ struct Instruction { bool has_run_{false}; #ifdef LITE_WITH_PROFILE - // for profiler + profile::Profiler* profiler_; int profile_id_{-1}; #endif // LITE_WITH_PROFILE }; @@ -135,6 +131,9 @@ class LITE_API RuntimeProgram { if (instructions_.empty()) { LOG(FATAL) << "no instructions"; } +#ifdef LITE_WITH_PROFILE + set_profiler(); +#endif } void Run(); @@ -159,6 +158,15 @@ class LITE_API RuntimeProgram { RuntimeProgram(const RuntimeProgram&) = delete; std::vector instructions_; lite::Scope* exec_scope_{}; + +#ifdef LITE_WITH_PROFILE + profile::Profiler profiler_; + void set_profiler() { + for (auto i = instructions_.begin(); i != instructions_.end(); ++i) { + i->set_profiler(&profiler_); + } + } +#endif }; } // namespace lite diff --git a/lite/demo/cxx/Makefile.def b/lite/demo/cxx/Makefile.def index 1b5da970e8..cc2e593000 100644 --- a/lite/demo/cxx/Makefile.def +++ b/lite/demo/cxx/Makefile.def @@ -1,26 +1,22 @@ CXX_DEFINES = -DARM_WITH_OMP -DHPPL_STUB_FUNC -DLITE_WITH_ARM -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK \ -DLITE_WITH_LINUX -DPADDLE_DISABLE_PROFILER -DPADDLE_NO_PYTHON -DPADDLE_WITH_TESTING -LDFLAGS = -latomic -pthread -ldl +LDFLAGS = -latomic -pthread -ldl -llog -lz SYSROOT_COMPLILE = --sysroot=/opt/android-ndk-r17c/sysroot - -THIRD_PARTY_LIBS = ../../../third_party/gflags/lib/libgflags.a - + SYSTEM_INCLUDES = -I/opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/include \ -I/opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++abi/include \ -I/opt/android-ndk-r17c/sources/android/support/include \ -I/opt/android-ndk-r17c/sysroot/usr/include \ -THIRD_PARTY_INCLUDES = -I../../../third_party/gflags/include - ifeq ($(ARM_ABI), arm8) CC = /opt/android-ndk-r17c/toolchains/aarch64-linux-android-4.9/prebuilt/linux-x86_64/bin/aarch64-linux-android-g++ - CXX_FLAGS = -funwind-tables -no-canonical-prefixes -D__ANDROID_API__=23 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE + CXX_FLAGS = -funwind-tables -no-canonical-prefixes 
-D__ANDROID_API__=23 -fexceptions -frtti -std=c++11 -fopenmp -O3 -DNDEBUG -fPIE CXXFLAGS_LINK = $(CXX_FLAGS) -pie -Wl,--gc-sections SYSROOT_LINK = --sysroot=/opt/android-ndk-r17c/platforms/android-24/arch-arm64 SYSTEM_LIBS = /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++_static.a \ /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++abi.a - INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/aarch64-linux-android $(THIRD_PARTY_INCLUDES) + INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/aarch64-linux-android else CC = /opt/android-ndk-r17c/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi-g++ CXX_FLAGS = -march=armv7-a -mthumb -mfpu=neon -mfloat-abi=softfp -funwind-tables -no-canonical-prefixes \ @@ -31,5 +27,5 @@ else /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libc++abi.a \ /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libandroid_support.a \ /opt/android-ndk-r17c/sources/cxx-stl/llvm-libc++/libs/armeabi-v7a/libunwind.a - INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/arm-linux-androideabi $(THIRD_PARTY_INCLUDES) + INCLUDES = $(SYSTEM_INCLUDES) -I/opt/android-ndk-r17c/sysroot/usr/include/arm-linux-androideabi endif diff --git a/lite/demo/cxx/README.md b/lite/demo/cxx/README.md index ec72c044e3..b7768d763e 100644 --- a/lite/demo/cxx/README.md +++ b/lite/demo/cxx/README.md @@ -1,6 +1,6 @@ # C++ Demo 1. 使用`lite/tools/Dockerfile.mobile`生成docker镜像 -2. 运行并进入docker镜像环境,执行`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/r0.1/inference_lite_lib.android.armv8.tar.gz `下载所需demo环境。(armv7 demo可使用命令`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/r0.1/inference_lite_lib.android.armv7.tar.gz` 进行下载)。 +2. 运行并进入docker镜像环境,执行`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/v2.1.0/inference_lite_lib.android.armv8.tar.gz `下载所需demo环境。(armv7 demo可使用命令`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/v2.1.0/inference_lite_lib.android.armv7.tar.gz` 进行下载)。 3. 解压下载文件`tar zxvf inference_lite_lib.android.armv8.tar.gz ` 4. 
执行以下命令准备模拟器环境 ```shell @@ -27,8 +27,10 @@ tar zxvf mobilenet_v1.tar.gz make adb -s emulator-5554 push mobilenet_v1 /data/local/tmp/ adb -s emulator-5554 push mobilenetv1_full_api /data/local/tmp/ +adb -s emulator-5554 push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_full_api -adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt" +adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt" ``` 运行成功将在控制台输出预测结果的前10个类别的预测概率 @@ -37,6 +39,24 @@ adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_full_api --model_dir=/da cd ../mobile_light make adb -s emulator-5554 push mobilenetv1_light_api /data/local/tmp/ +adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_light_api -adb -s emulator-5554 shell "/data/local/tmp/mobilenetv1_light_api --model_dir=/data/local/tmp/mobilenet_v1.opt" +adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/mobilenetv1_light_api /data/local/tmp/mobilenet_v1.opt" ``` + +7. 编译并运行目标检测的demo +```shell +cd ../mobile_detection +wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-ssd.tar.gz +tar zxvf mobilenetv1-ssd.tar.gz +make +adb -s emulator-5554 push mobile_detection /data/local/tmp/ +adb -s emulator-5554 push test.jpg /data/local/tmp/ +adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb -s emulator-5554 shell chmod +x /data/local/tmp/mobile_detection +adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/mobile_detection /data/local/tmp/mobilenetv1-ssd /data/local/tmp/test.jpg" +adb -s emulator-5554 pull /data/local/tmp/test_detection_result.jpg ./ +``` +运行成功将在mobile_detection目录下看到生成的目标检测结果图像: test_detection_result.jpg diff --git a/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 new file mode 100644 index 0000000000..784ad73da4 --- /dev/null +++ b/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 @@ -0,0 +1,61 @@ +ARM_ABI = arm7 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \ + 
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +mobile_detection: fetch_opencv mobile_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_detection.o -o mobile_detection $(CXX_LIBS) $(LDFLAGS) + +mobile_detection.o: mobile_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_detection.o -c mobile_detection.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f mobile_detection.o + rm -f mobile_detection diff --git a/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 new file mode 100644 index 0000000000..2304b38eff --- /dev/null +++ b/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 @@ -0,0 +1,61 @@ +ARM_ABI = arm8 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # 
+############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +mobile_detection: fetch_opencv mobile_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_detection.o -o mobile_detection $(CXX_LIBS) $(LDFLAGS) + +mobile_detection.o: mobile_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_detection.o -c mobile_detection.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f mobile_detection.o + rm -f mobile_detection diff --git a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 index f795b41d46..8ab8a3b743 100644 --- a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 +++ b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv7 @@ -5,9 +5,25 @@ include ../Makefile.def LITE_ROOT=../../../ -CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include +THIRD_PARTY_INCLUDES = -I../../../third_party/gflags/include -CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS) +THIRD_PARTY_LIBS = ../../../third_party/gflags/lib/libgflags.a + +CXX_INCLUDES = $(INCLUDES) ${THIRD_PARTY_INCLUDES} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = $(THIRD_PARTY_LIBS) -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_full_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_full_bundled.a` + +#CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS) mobilenetv1_full_api: mobilenetv1_full_api.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_full_api.o -o mobilenetv1_full_api $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 index d0767145b0..c13320603b 100644 --- a/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 +++ b/lite/demo/cxx/makefiles/mobile_full/Makefile.android.armv8 @@ -5,9 +5,25 @@ include ../Makefile.def LITE_ROOT=../../../ -CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include +THIRD_PARTY_INCLUDES = -I../../../third_party/gflags/include -CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS) +THIRD_PARTY_LIBS = ../../../third_party/gflags/lib/libgflags.a + +CXX_INCLUDES = $(INCLUDES) ${THIRD_PARTY_INCLUDES} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = $(THIRD_PARTY_LIBS) -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_full_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_full_bundled.a` + +#CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_full_bundled.a $(SYSTEM_LIBS) mobilenetv1_full_api: mobilenetv1_full_api.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_full_api.o -o mobilenetv1_full_api $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 index d235d6e25f..9150ae6e44 100644 --- a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 +++ b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv7 @@ -7,7 +7,19 @@ LITE_ROOT=../../../ CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include -CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) +CXX_LIBS = -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) mobilenetv1_light_api: mobilenetv1_light_api.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_light_api.o -o mobilenetv1_light_api $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 index b91aadcef8..7a2dbdd0fc 100644 --- a/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 +++ b/lite/demo/cxx/makefiles/mobile_light/Makefile.android.armv8 @@ -7,7 +7,19 @@ LITE_ROOT=../../../ CXX_INCLUDES = $(INCLUDES) -I$(LITE_ROOT)/cxx/include -CXX_LIBS = $(THIRD_PARTY_LIBS) $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) +CXX_LIBS = -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) mobilenetv1_light_api: mobilenetv1_light_api.o $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobilenetv1_light_api.o -o mobilenetv1_light_api $(CXX_LIBS) $(LDFLAGS) diff --git a/lite/demo/cxx/mobile_detection/mobile_detection.cc b/lite/demo/cxx/mobile_detection/mobile_detection.cc new file mode 100644 index 0000000000..9b8f02aeed --- /dev/null +++ b/lite/demo/cxx/mobile_detection/mobile_detection.cc @@ -0,0 +1,210 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
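The mobile_light makefiles above now link against libpaddle_light_api_shared.so, and mobile_detection.cc below drives the same light API. A minimal sketch of that flow follows; the model path and input shape are placeholder values, and the template arguments are the usual light-API ones rather than something introduced by this patch.

```cpp
// Minimal light-API flow: load an optimized model with MobileConfig, fill the
// input tensor, run, and read the output, as the demos in this patch do.
#include <iostream>
#include "paddle_api.h"  // NOLINT

using namespace paddle::lite_api;  // NOLINT

int main() {
  MobileConfig config;
  config.set_model_dir("/data/local/tmp/mobilenet_v1.opt");  // placeholder path
  auto predictor = CreatePaddlePredictor<MobileConfig>(config);

  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});                 // placeholder shape
  auto* in_data = input->mutable_data<float>();
  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) in_data[i] = 1.f;  // dummy input

  predictor->Run();

  auto output = predictor->GetOutput(0);
  const auto* out_data = output->data<float>();
  std::cout << "first score: " << out_data[0] << std::endl;
}
```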
+ +#include +#include +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +struct Object { + int batch_id; + cv::Rect rec; + int class_id; + float prob; +}; + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} + +const char* class_names[] = { + "background", "aeroplane", "bicycle", "bird", "boat", + "bottle", "bus", "car", "cat", "chair", + "cow", "diningtable", "dog", "horse", "motorbike", + "person", "pottedplant", "sheep", "sofa", "train", + "tvmonitor"}; + +// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up +void neon_mean_scale(const float* din, + float* dout, + int size, + const std::vector mean, + const std::vector scale) { + if (mean.size() != 3 || scale.size() != 3) { + std::cerr << "[ERROR] mean or scale size must equal to 3\n"; + exit(1); + } + float32x4_t vmean0 = vdupq_n_f32(mean[0]); + float32x4_t vmean1 = vdupq_n_f32(mean[1]); + float32x4_t vmean2 = vdupq_n_f32(mean[2]); + float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]); + float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]); + float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]); + + float* dout_c0 = dout; + float* dout_c1 = dout + size; + float* dout_c2 = dout + size * 2; + + int i = 0; + for (; i < size - 3; i += 4) { + float32x4x3_t vin3 = vld3q_f32(din); + float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0); + float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1); + float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2); + float32x4_t vs0 = vmulq_f32(vsub0, vscale0); + float32x4_t vs1 = vmulq_f32(vsub1, vscale1); + float32x4_t vs2 = vmulq_f32(vsub2, vscale2); + vst1q_f32(dout_c0, vs0); + vst1q_f32(dout_c1, vs1); + vst1q_f32(dout_c2, vs2); + + din += 12; + dout_c0 += 4; + dout_c1 += 4; + dout_c2 += 4; + } + for (; i < size; i++) { + *(dout_c0++) = (*(din++) - mean[0]) * scale[0]; + *(dout_c0++) = (*(din++) - mean[1]) * scale[1]; + *(dout_c0++) = (*(din++) - mean[2]) * scale[2]; + } +} + +void pre_process(const cv::Mat& img, int width, int height, float* data) { + cv::Mat rgb_img; + cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); + cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f); + cv::Mat imgf; + rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f); + std::vector mean = {0.5f, 0.5f, 0.5f}; + std::vector scale = {0.5f, 0.5f, 0.5f}; + const float* dimg = reinterpret_cast(imgf.data); + neon_mean_scale(dimg, data, width * height, mean, scale); +} + +std::vector detect_object(const float* data, + int count, + float thresh, + cv::Mat& image) { // NOLINT + if (data == nullptr) { + std::cerr << "[ERROR] data can not be nullptr\n"; + exit(1); + } + std::vector rect_out; + for (int iw = 0; iw < count; iw++) { + int oriw = image.cols; + int orih = image.rows; + if (data[1] > thresh && static_cast(data[0]) > 0) { + Object obj; + int x = static_cast(data[2] * oriw); + int y = static_cast(data[3] * orih); + int w = static_cast(data[4] * oriw) - x; + int h = static_cast(data[5] * orih) - y; + cv::Rect rec_clip = + cv::Rect(x, y, w, h) & cv::Rect(0, 0, image.cols, image.rows); + obj.batch_id = 0; + obj.class_id = static_cast(data[0]); + obj.prob = data[1]; + obj.rec = rec_clip; + if (w > 0 && h > 0 && obj.prob <= 1) { + rect_out.push_back(obj); + cv::rectangle(image, rec_clip, cv::Scalar(0, 0, 255), 2, cv::LINE_AA); + std::string str_prob = std::to_string(obj.prob); + std::string text = 
std::string(class_names[obj.class_id]) + ": " + + str_prob.substr(0, str_prob.find(".") + 4); + int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL; + double font_scale = 1.f; + int thickness = 2; + cv::Size text_size = + cv::getTextSize(text, font_face, font_scale, thickness, nullptr); + float new_font_scale = w * 0.35 * font_scale / text_size.width; + text_size = cv::getTextSize( + text, font_face, new_font_scale, thickness, nullptr); + cv::Point origin; + origin.x = x + 10; + origin.y = y + text_size.height + 10; + cv::putText(image, + text, + origin, + font_face, + new_font_scale, + cv::Scalar(0, 255, 255), + thickness, + cv::LINE_AA); + + std::cout << "detection, image size: " << image.cols << ", " + << image.rows + << ", detect object: " << class_names[obj.class_id] + << ", score: " << obj.prob << ", location: x=" << x + << ", y=" << y << ", width=" << w << ", height=" << h + << std::endl; + } + } + data += 6; + } + return rect_out; +} + +void RunModel(std::string model_dir, std::string img_path) { + // 1. Set MobileConfig + MobileConfig config; + config.set_model_dir(model_dir); + + // 2. Create PaddlePredictor by MobileConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + // 3. Prepare input data from image + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + const int in_width = 300; + const int in_height = 300; + input_tensor->Resize({1, 3, in_height, in_width}); + auto* data = input_tensor->mutable_data(); + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + pre_process(img, in_width, in_height, data); + + // 4. Run predictor + predictor->Run(); + + // 5. Get output and post process + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + auto* outptr = output_tensor->data(); + auto shape_out = output_tensor->shape(); + int64_t cnt = 1; + for (auto& i : shape_out) { + cnt *= i; + } + auto rec_out = detect_object(outptr, static_cast(cnt / 6), 0.6f, img); + std::string result_name = + img_path.substr(0, img_path.find(".")) + "_detection_result.jpg"; + cv::imwrite(result_name, img); +} + +int main(int argc, char** argv) { + if (argc < 3) { + std::cerr << "[ERROR] usage: " << argv[0] << " model_dir image_path\n"; + exit(1); + } + std::string model_dir = argv[1]; + std::string img_path = argv[2]; + RunModel(model_dir, img_path); + return 0; +} diff --git a/lite/demo/cxx/mobile_detection/test.jpg b/lite/demo/cxx/mobile_detection/test.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6bb36e136deec6088c7b75215fc35d6231283673 GIT binary patch literal 127499 zcmb4qXHZkm7j}S95_*?vfKWmcklv(2LTCv!6sbaJQUVHC=mJth=pB+kLT?Hv3P_EG z5Q7#kbQs#?I!4Q+Icj1B%b2m_d%or{x8oQFr;KtWo;;Qzb)8v*dL0bemSGcia5 z7Hgos2w-3WGPAIwnXYNcU$9wjkk&)@F`oB61jJ#6nOnj#NKqmod4ZmnJ=6|aI zj{j-!GVuZ~0p3#z(i33GJvn8Yk_g9c9H^0Fe6;Gno#vAzNkt~sH@bSuA3n7tqclBO z2(GtJ8|R%<51F%PKg7u>4Hs|$ZFtp-ij3=T5M&fG9TOeC)Se=V?^FUyPb44Z`1!P5 z+^UOTnEVNse{pb0%OiE)QH2du{?%RN3B2I3rI8NE1vV|~?w4AkHEEbNR8xRgrLdY6 zb>1anE=Sn(QC^`Zf;Ffm{nl}oa5YzTQYp7tW0-{OYcR)=(pRdUE(s^D)9Iqf%Mmli~KN|J)4$E&!jtb27yQW!mG9*l!SJH4LI;p9L>M+lOE@x>L?cj{f%RZNm^q@3u*H$#fUgKR!UkB%53Q(mO2dIhQwz^4e&*$@W47RN zpgu-uC(7DOb)Hh+uTVY)FS#Y8NX&Ea5%8$=VwEbw#+XeqftN8`(+rZP>%fS2agps!TSp|ByY$xTMTSH+3!Hp`Iy_%SDFrj z)89TRPVT>yp`QMU{@Y2|S|-zJzP$Vt@>k$);gD5`d4(dm$;v%E-*GC4Ene^rT1+Jg zHayQFC04-!M@rb~MkiB^N-?WLj(g5N&zeDXF3D7vkgv-9KF_HAupP!h#H17O913L! 
[remaining base85-encoded binary patch data for lite/demo/cxx/mobile_detection/test.jpg (127499 bytes) omitted]
z2Tn2%5AW%*J@p6A42Y?U8__Q+=9OP(#V`0?xAv206Qsxm&9sYbx%Oi>kH>_H2Z>I} zB3Yaah_uK?Jkqoe$D*RtR9R9BnNyAwGHyY%T_D(F;rjMA%S;PJl@CG5lQufc>mZ6d zQ)8w$$oJClAsBY6|K%kqpfXYDAS!khZ~m3Z-@n0!*@ijL5!Bgf#KM_wk5seU`^n=D=RRXTmEr7hvcfY^fa~&aO+#Z z77*IC@2pc=DLtg*X?0cFxv!~wxrzU#AKSTBNV#I1J88hg2j{UwZ3#!gU7Dm23?>Kz@F(_jA-pRv zL9pVy+M)A*fI0XJ9F3Iq^gdqlkR-0_d~%rg#K<4d1l=l<@mWDri2CwS>3)#9*wCQ|L!VlY!P zA%x)x{Ps^}@xTLw;l)qARHR@@+_}|8Ydv#(5JbMs8r}_U{qK4S@t~n6_GreEl!N^6izLwjrtvhBatav1R_=@W8ko?cHj9 zSS_{VR?Aji6BBD3!95<;VbXGyO#Gv#E9k03zha*@Q054s^6z!l zJeKL+-aCHT+-4en74*aJ%Nq`r7R0aL=H^vhw*o3GTq zT;)`LX<|udvA}!LF?WtZS)n-BsjT#lgM#nv;_j2eFXYMg1OY&KI@gD z?G(LO)y?F!+C`{2E16tiY5r-h?#!(~vl2U2bb5OUPbO}6nzLTo+3Xh2#bK6VQ95*Y z&Y)edLQZJghIzeb+&uy4e5|h?Ss0a{W8YgXZ(;{|zJI-WNj+6g%8yAkqtQ=z%&`}p zB#C!_D|rl28XK9PaT$WNbOT_^I^(Qdsw$F9$yxM7%Pw{beY9Ly8#W9r=`CzTQI%iY zLgiPL>kzmIfcRbSEv$TGC%N=EN{kPxhxf{MB*xvz>&-^ zk+D8j%-Lwlrf0@}rEXl&v5?o+w_X;#mMOD>Nz<*2~!H0>;$M0Y9f4y;jx!Fr% zYh)Yv7%P-A22Bi$b88aO)f{3L@rq7WGUky*jlZpZ@|MZ%YQm(0V{y~UO8aO{sXH8A z?g1VMaiII4*d{{7rt_t&aF@5St@%z#hjUsc`glw`r1PP^!^&a~kTKgV{=W6ipSq)#u4 zs_}g=XIRZ|QG&=pjVI1^eacooyqSWOH!cq=s#k`*T!8O6#pKA011ZWW_f%@32lcL1 zi?_aKrmX=f^9VNf&1A%WoTHKa|Y1hzU5KWYDNO;K>X0-RsxY!V)L@6g-0sStl($*%8jZuOJ<>2b7 zFPFzp5X2;6obGt|maPgH>El#yp(?*18gd}JbH%c2un1rmNMPCNN|Qe4s&{6Fc(xr6G(Q>%00#Xo^in$4%6$KWH(aDf> zIiQLC?c8x&ZRh?k##67M+2?&;M}1l+{i#W2@^|wdaHlm^xjCHb*+`~ckcw@m7ED9q z@c#jNKl(34=z{t4kN71HjjC*l&7QWk;j|}DcMLL zmOrOwQxN+%pTjvNZBV2&0fm`{sXy6nRwb!3^diG;I7fhjzp5swtgBb)lCGCe59F_< zOOOSTdyuHzS{b3kA*uCjh)({tMb3+9uwPPtN&sdCK@=Q4P|akqrJ@+tuWmgqADszF z1;}A|xy^+e)KHDBEsR5(=3gCx`0>cXOg~Ke9gmx@vbGx{anWF|WmsK=@>>v_*3PKqpFsUt5#(Cph z-Lw@Cj9DQL)5lDQd48Q*>rC#AD7N6YjeiW}md#poO|ZGG;|(6yB7gLV-)>HU{y-4Y zHj?>+n_bVjOJ463<_Ts*e3QhAYCMAcPI^j!V8nZ0eD+Rk<)*uXmv@9ps;mxK2dt_q zc%I+<$;GR-bnN_MX_@60{rC1$T4nS>xYn{jEM+Od@iZte^TC+pe*hJ*0$XV9$IK+w zO{;m-uJ_#ipz`G)77ms_3PGJNGIA>6q5-)aVXq(XtLh8EPG-ba}KgFYTklNw|zdudq5H7GCBWz zCf#7?Ylz-Ns!eGvGI5Zi6)s!E>cuJ8Y*%3ss~(8^n?o_j_WvaQ|8e`waCcNpAL8m1 z>zirk4(!Uc5UX9Mro~iYSmiYq_B@Yoej(ILvbUA9=9=U@d@)Rd9x#7jCLj;n7B3pB z9xM#h%m&hZ3>RdN$>h4`iw&(TiLTMG(LB#R-7(R3K?Trp7HJzmxX3ltO6pI(NjaSR zNL>oYG?+wqWHM}|M zsop8xAw#smM7q)is1EF|9agkUXB$qJ#In`XH8#@bj+?^oJmw{FUFgHsu{X}UiD#G$ zcV&<+*v=%`W82K_P1p`xU#nW`?=!V?Ge$psWDg=*7nD|KE+G=t?ZB%H@1Y} z&9H8N_{SAGF*Im$<%|*Kpz4TqH|Yir%hc<6#Ja6_Gx&@dzntnq)R=c*^~tH9W>yEN z+Cau%ozt<5tLcNJV)D_Bc##pWi3luJA1^$Z|jDCqCT{?s54}w z4B)5>$Lt23qfIaD7A|mo`B^#W*3T^!w?icTiQj#kU~yT02`eVhOe1FU2`L3Us;x8#x!;zC3B%a)R9#9T_le_%U>a zZZ_pJE&IfE-TTA>9jR;8lwh{+54*wRwdT_nDFU?)BRu^loeZ9{T9dm+m}g^lkaGSf zb@FK75n-aq>xk`!c71b5T#69ruw`bNqOc_ezc$qhU+OO38hhJ7b_A}qsKR+OD^c8j zCf>@@TioE&#(AwDV4@wRW$IMVG%O=cNM`?Xw@A=bYuj|FsEJmj)}x?D0N3^og3FYX zcBnBx?C9x#03veV&8u1`XyjnbKd*PaGX`oXKT zbqyFeTnlcNcm`hj`8x{UC#&a;^uFq{piciKUri*_Os)5rbcVK(daKsjJn)4!x3~f9 zf}EAE?u*tq9I6Xm-wXU0_S0pW)^>vhx;2fTNVvu{b(D~-}_ZJz5|()C)i_gB(6 zV6X8ZDS2nvKvnHaKImX|Ih(Tvqxp^SvqGj3Wv36?jC;%rnqR{;EniP3;i^aYEHi^^ zTjq1*T=DyY6Fjb>$bjS z@bV=F?RkAlyD$fNs$gF*!RqZ?7kw`Ia$D}4wa{qFf}gi*M<3Ec%V+M%F`Pumd;`6l zoo}&Yc#q^$(Gx`_pTh-3ueL2*!bp5V_1w>4V4>xZ>h4&bYE7qrlRB*gV92H&cmEx| z8X=N+>F*L@E4At|CL=BavOPsP@Xkq>{>)DGLbRDeN&j1mIvsmsrOCspNoD%)N>X<+itj#6EqCcNw_8*)t1M|M*0Il^ z^N`Fmtb4D-kJTDC5shsQ;ywb}!LiCHwiokR91h5HS6JxnhQ<^YU6EZ&9gm z$wZ5f&U2b7?!5u10%DUBQZISmbez&CP`4R$4o#<=g@tJGUfph=k5dH8jY@(N=a!mr zULOJH!BU5oee*mKzXt2d8m|7{+Uf3s*qM8m#y1fp;~rf5kfllR>pQMTY%b>>&xii2 zGF?($8-0&E{sRcv<*c%r{6z}xY+Tmf@ox=->EwR>h32M-G(XCH@*eMq)K5%LB(lsd zE~Fg{dk4b9-?`QN*7|G!@%qfS6_^ly&Ef9F0s{e3d+!X+-0@XlEXE#Gq;^NPY9|6I zBs4FR(jsDzuSjm?*H5PKDT{m4lMCUO24JuVC2i|!eBx+1yB%RVPe4a&^ym{oJ;kiu 
zl*6o*?mKkNVZmw5emkWhiTv25w1W9ZIjN5?ag|!jRHU zl+9_PqbKW%HtX(KeP>WFDM7Xq!nDIFGjXfF+~828eZ$CSeZVCzN3Xr8xml!$Ll#4< zs*0WZY1^NQiCZr`Xmk3Ru?#OL@J<#LP4q_|wAXFSN};Ck_apL-t70~tSJSRKfg%X* z9<~q5Gb?cKe(cbcSvHf@3h@c&nvd7N*7{^)N3Wy*FWQ6sSwdMh8{894JxjTe_LcBq zde4-Hv+#Aa@$rKl_^x)LA4B|gbO_Fcv}RRO`KX{ldK_1GOg!Qo`w2@29CH1Y!pZQ`mL78mPPd z?}8mdNHvw}iz)3XQ_cnTuJww>CG=83G<+y)mIs1+(ToUfdySe}DVg0pDbjsG!2K1E zznoj~-MeEjtdq`WxAIFbKMS8#>C0EyBq}17EHi%IwXrpQlqbYb!-P%?T4wPN4>oU#sp8mRH$_f?o9N-r= z_&Og-iV*M39B^&bdBr48ZvQ+NQ_C|`_ro7^2d!(s)8S0%YZNjGtYN$IBCrHMQ|2n@ zolbqJ-r3j2;zW0Gw?f4!44>j!z~#RvSpmWbmU+p?S2AW4;H{k`rnj0EqBBb^-vu^v zm=yqp$SpRs8RkO*6bl%ciA#X-hyb!Pj)mWcrJ5}03E;uD8q$!PBWzY#!~i;MHK8C1{_C}7AXJ(J6j zNgeFWSfj4ABclOk(q(&vX24ztDw^;Zp@|y&QZ^q(iZixKLARod6}3__Z0IZ%S%6y2 zCbCCWN|NGAb|^Z^1P+bDp(+MQ6`wbui;=iHKOuj$tk1LX@@!g7V0kn2)2W8E=#<;? zq&cS=Lh9CPT{Vx?^a=3-Q!?fKib|Th6WP2=D|hSeV!I7PG3#wxL&tp9qEL29Vu2o`@Owr(gMtKg=nR--WSBsx*{G zNL^Bz8I29SMUG$Rzrr4W*h6e^%?7(51OQeGroc9DZk~E#?1K*`CZ8vdP{l^YAZ2Z_ zH_|bT9-M(%h{fqWTPxSmN8Z=uV(nstVImQx^gEUXK@KB@eNE<*Pw>yylotYRYhTAf zPGtLaqwB-^I0>On1Xxm_TH!~P&gSN1l?nXv?_sagq7G<)CS-BTY^5IGh!0WKHOyGL zdHJ~J9oP9ZJgmaB`(s98zgtkvobcMWXgbo?$@20^=zy`O*hSWznt}qiUUggeY9<#~ zJnl(zax(`fuN|?ldp($ScxMij-e)5l* zX)0m&fLpHI7D!norwsVEm*hvdRGj&)q-F^B_TDP*s}~4ztPFKrmSPHRQaH@T7_=f* z^dP%#+{P-l4s@C$k_Nro-G3impHnJxyNDv?&Y24brwk@rVM&Dq@*Un>>GjIezhkmD zG3r=pLEIx7qeU?@7HT|c(JC+e>t=_PJJvd6q<8K5j@`1PWfP&eKNX$hlGMKG{=b0+;CB}PGluf4ieGcWmUMUBC1`u3Z`4A^67Hh$@u&lH| z6$gvN%0qT?tNX(trM^mhsfwZ7>Ypau^Xh}x>4qhbM7n`r#J}2XA#S;$8Z-3yW3a1@ zTo77M6x*Uped(Isol9onermiP>mS)rbqH00L(iaCkE+-O!AhT+ra+Nf z3z=#?ZjQD#1CK1ZTVa!On(}|n+Z3*3dvEjm8DetR^^@;A-y`6k>Sj9ek$J9Lv=k}H zpL6rD`XMA=6f>(5j$U3jDMiTI^u?Ds@8u!ssb_Q#{%XHTGN9Dhu{Q-G)mjgz71tQh zAiIB!7SCR#GX)k1n8&(iCL?)|I9|O+UjC+aB*E_krE9K(4@Bz0&JstYZp|-%ak2vf z(3;@CCZaiM53N_+mf7iv^7A36xp&4NgJzTKvZny=8TlCd0`WHhy z>)8?$SjDF-T0-ui$1SG;#X3_iT-Ln#IvVL5BDPOjk3IXkAzJVUJbs>B0NIB1RvOao zynhhRTdki9g;ViD&!*{OjQoj^c_>mpGWXW}CATag`FoDCqZf$CW*R{RQNt-d=>Aw& zDPXnfNzACf#JxMK_y&65^o(EJCM+0!d>pjn5hvF*1gi3DpufK9AElNfU&1`fUqGjs z;it1}ie>wQ_S@mnkaux?WncVYjiUy^on*0QY;{~2MD--KA?+j8>hBR>y|lQ~0D5%$ z*??}j#UxnxK95Hd0ivrMny9PKrlYiL*D?C&XO*b$-cJ>gNZd^)0427E8bZzPZ9R+E zE}`GFq zg=f)6H~1nr3Mrw4>G|i4T)2%RQnKKH5wT@GRsi(=irmg5(jgsBExM}u!KO;jhd#1~ zu(+`xs`d+4)}Gqz6dBgdwIDiwCTGtS=aL7+eX-nh8x@2%yyW}SQX-Yzd-`T((8~l~?rKx@*gZoBZ<1?p$kKNpXqq`?eN|>}-CQx&X zz0d6xKGb-WRuEb$#aKx60kxe?Ik^%ne?l+2%G=LI#HuA7k3Q?}v5)0;(H>_DKOO`- zG1$-hbht?3h%jbXhxVuGck8D1v^_{^ncnZ|qm(g;O#^GT^_ng_UfC;MRQ|lm3mnZ9 zm!H1vaSa3P5#A2fDO2b1O=X3dtj!>rhnOm?kf%pGazZ-aWmy zePTx`V+IA&e0R}K%C<^UmdUV=`G=!M0o{b0VxTOYY=xeQEEQ$LMmSGl@-&r)%Z6+> zpKB;H6dG6VsHkf?WvVA@kXJY_8Shy0g8Idm!jN!Z&*G zM{n0F9@4k|o?z0u`QuN|Z7uk*`on1SNv}41=Y=JwWz9&pZ0V5A@)$FE05m>eqM6Us zHLZ2|)jV9869 zbqa|}@!NZ$wP)>;=K*?c?fY9dw3!zoQ;c&HVWAi=23XWa)4Sz9iCt8nr=Nb%*ZV@G z!A5FY#RdGx+da(7bvUKR)V6PIltx+zRUUYErQ=xKH!1mT(JS@N6G^RFpn2o$6MI6~ zC&o4JH)$22Po8yGt)-5L8%5m#7O{L|b2;!@C=`$T%{(@eT(={=Q59=lpE6BWpMv<3 zG3B7jL}^2SGwme5?te;#5hI>#t2LWfy_-7jb<=(C!rZLZqK$8VYzL>B-Je=u{tb`4 ztMhYl!n?kYphZbDk%ojecwCAYreSnHggq9Xu^mUJ zd0$}0#=UJtR<*2!Rq{=sK_moL43p>87a!upX)h1Q;-5dsNexhiBgsU~!J9dR(3~5r zkCAk7HFwBpXW32Bkd)Sp?@s-t)%+8biE}czuQ0d|4;xHTa-x@W=gOE2kq8-LvRe6= zpXNK@dMS{>tfJ7RrSnY|+ZUeug7&teiUecfNc;nyFGpoXOKs#j-qT0g#zigqHMdzd z6aVZ6UIZyJbB~wA(qG%-*!O`)rmCIA-Upud#}rwk;Cj=v zuA1@N3+1>szIck#Ow96o6dv~Q3G?2)`XA_=OMv$KZeA=rcBVT9!c?nH0u`__96UDz zkwxvagn`d*4itC4{O;^-@5Dc@T?bUl0B_$SEpl6_a{l5a$3Uz^?LyxMkE!P|jLJiB zX_`OZQE3}xY1si70e9mO$6}v`U6-0d##|`=un!jR+eLl*X5wQki9Gz!7VXJSrx&n| zUHY*V{OWuD0eOcG(AxGJ<;Gnd(|na>^2Y=M@kzX(TYqU-rRHH~auzwOqDf0-;v4K; 
zDY$Pc^F6?ud4oQyaeGHAgfLeO6`9jSKnbr?A?sAooQc~VQr!=f9VQYx=LS~DYjL?G z+n~d`Vu8Z(t4621ZTDvKr>*XlW=_+zwzZaS?S+UEGCtFIftuthtG^u;J6wrsj6)e( zm7hp=U9;oG6~ z&)P5i;dA4>&TyNBMy8QHumgaD&%Px*j|dZ3p{I2TB>y&HQ|{B|O8wnLyuckqUoOhT zq0UcqhYo|z$J9q9wcz>zB@mz;U5`FWKi_wzQA=gOB2Z`s@*Q);?llmYHPDdk>)TvQ zyM7KTbx7Gx06E#oDNadjt{IYx=JaGGIRk+=*t0KPcd_fUVF${ihZXFVKJi?Tl@5FB z#j0njB)7ERile=2VgPn}>f76Dt1?#qY`3y&6-oBo|_ zV@zIj{Xy;6@ryNv*NKijDO@VoGC#Gc|N7?h;LDr~V#0bl5h-wkrD2E$lMiKEm3LiP zH#ZXWX=vfrXqDZ1{U3myk}j79`7-5HcV##d`q(X!4sw#Xj8J;`Y>0K(qg@H9KiO!L z3aD!df#ad1=yXThRXBB~Xk>`JWYNnb{_D%8fOTr~Yt4Lq1C*Q6 zl5D0N=u5rbvx5|-acDvO3_LEMZlXQ7?IeNb19wRnx?*!3S}3HP|EIRtD7fv0n-j$^ zi{7%()hZ6%U{7hEh$;E}xrSWE*M#b=kj3iwCYX54P*%4N$Ol*I>n3}L4w~c{uXIFw zt5hPLLj)S&BwNr(!R(LhobG}wXx3!`?c2w1JDODQrlP~KTpU*QETaOUMgCAwUcQA$ zG~a%HoqkrneAwGDy8(ycKtRC(PhQ_zQql6JjLu*q3?T{7}uSl znsBqFOVCTRh4+W)-0SvxKR=su{o_Dq>$D%hJ+2HJte^eb28$V?>Dwn4E=RkGHppQT z3V(R3r8OC~AK<>Z6{&Ug_1{bzd=ZM6(|Q%UyI%TTOrYlpk|(yQjttvZwHJsM6H_%* z#$3Af{Trd>euZA!cT{hm5&iRrE;I*Ty2sg@w&yvbpBV}-#a;cwW`$$AyBs~eyLkJk z2v?30DUV>gCV=R@-1u;^xK&%h#I=8Z%77i!+R9UZ$(HlsIci1Zo zzA|x2Y95^VqC)gt(a3yFgt>;E)5p@(GLO?e>1_6GO$VKgd%X69_hy}q{pZ9 zrB~=2AuTP7x2eKq6mV2+qz9*U!XAX!nu;oYU=O136uWdT88IdH1_+e4%`ng&`-J0Z z!s4S&Vr7kH2_D^q{{XBTzIngYDv1DKah<+N)O(bih#!|`$`@>y*E9=P{Nq<=K_T(Cf*Z3-agXH3HS4)>=9g~#h{n0*2hCdo2O4;fpvzfKU9Di_7Sro0yHq($C-umqTK_v4@(b9Oth-6@L~N2 z&eV$PrWS{i&UaZo)4vr|-A45H92n1R+Om|(nGM~SEzN7ZmRr~gnbg)bdP5&O@EkuE zy4PB~z%!g8P&hcpBBYItVf8?C%CkUJM~8Ug}F)>V&2hT z8C?FleiSgS+Zgx5E{N@;wTfEyUzvP^wrcuGR(nLye8^gLL4ty~rUOlt z3+;n=fly;^hyYT$T<#66ZZ5RKg7^Ey6j$cQK7Y;odtcsVe;R!n{HSz3x@VTuvv&b$ z_tJDwtgWh4DOhO}`7yubWtp2Y&`%a4c~o(Z(-3El!f&t_4SP~_7_rczFkv2^s^$ti zv+<|0f8H^TW&hfaiXsT9OC??jI6w- zu4RmLjtS*dk)(BQK_9n6TNn%(J3Of%b}(JTa2pj_Oy6U)A5~w#s>hItRf; zx^#z}p|a&DOGd%;`*`*J@E^|Uq^9?ZGi5Ek+NIZ2qHs{|5WZ}$t$TkNg4yBreyMTie<5w5x_;GicC%XPLX)MzhJD-uYxZbGyoBg^?bD>})9~_0#FNlj`FFAGfIpxdf8yU6YPb2|0Q|nAZESaQ*^e zvzNYDrWT-S#^~4%L3bd!q}IfK(BkG+)t!yz`rhA^->+{bdY+T%yPks|D6dc+{A@H2 z6Wvs$iGL^i9+hRybJ=X_K6Cn7e7@o2LKrHVxsIZTYO4Yb<2-bW8eUXK7wVoMejStWUTC9`sn)A)I- zQwj=GFflq8RVb|sgIb@!vgLYKv%*%hEh0dW2hcj~Z8WIM8uIDM4Jd4ksv9?CvXWp4 zU4EFAwSD;H9tOJvY<+Z^)O?M2A-dw=iMO^+fB8i^!_Zg;>f4x+jlZ2VBE@Jq)OzdR zEpDNh`NT})DoQ9#S$KH{aNw4Mv`2c!<4;2My*^&ItZHQBZlTS|u7jqhXR^ORv?fq& z9R7AwqIpcZX9#}*{nt)psb}+agK}_W?h38!lVFhRfP>gSuNO-*$M-|M8yl7ds#GI6 ziKKh8yEQE9!(ZGUy}~l7d_+5{RvSDE?uEH&Bp({n`O%)$GhRl>N67gGE6EwUaD3`h zO}4i;{{cy8$fcUy&TK+y^e+ccWn1!5E@(2HY!%xSC!HGIP23tW( z8`k9%C6SV0JE=R}_xPg}p;n(GG-@&i)~r6t0qrd3G|x&the062DDXUD$ZiB+2dz>6 zxqayRytr%|C6qNM@h*N9S*-F4q0xoKkj>nc)SJQR+DZFefW^2 z?3afpb#NWt{WG;Vzs2|*4bHy z!`*d8W{EP3>{W?4=PDxOoU-?dNTQ;JWJPwFWhE4e?9uN&-{0Tw^?E;F&*$^?dOlyz z=i}iXM>?T*wYro{cVXu1L~lHDdRi91;s0jDP`Rslb3F!UG#MLdc(BU$?Aer(URPzGE(mJH6by1!xWamX{q-vm09;*;Ykmw zaRc&7c21Kda9Q2sw6OH^gQRB_Rp@UV>Bi`)_y!1tD9Lz0`*6ZZIpLc&UO7j)qJ!~v zw<`cmx)0YdExbo0@MhUesq&Q@amW(7WX)wspmm#8S4J>EILS-fkZBev#c7ns&L$d# zi>BOU9U9~8g2;t)G}uVNCDkoq6X)Nj3w3)s#*MF>{*8p~RdHXN2_vQ2kGdEo2xw$x zn)>tkH4Lo6O9Kbvtv+mX!JK@&kjm;ic_&WKb%eBn2eImk` zz%=oGdQ#Gn;MJtymsm~xNc&{#$39m@eh4L=+JXfpEmbOckHIB53GiN>BzBIZ%#zlI zn(Ix2H_`>3ZzMV`Jl$Vz9HLjz!gA5{V%Cr0L&-Xwcj=*nh@LVkjm~JqVXYZsvN*LV zfqV93iYdH@Z`+(jIg&v)F@KeJQZe<@XSKguFRq8_+pLT-_asKozA4O4*XCkFy}Beq zyP6HtV`xe!4iswnbXlhy;8DyDSVNgPYC0T9Jckan2o=FhMy8~Nu3)Xa&t3jb8bK(R z`cc#0ci&$4K=3W9aPia%gExM%?X5j))3|QkTWW_lv!X8=k)?y6-Ml!Wthd@q52=x0 zY%;=Xr*uQ)hWs;tr#tHl^oclExUcQVhnDh7`U5WEIQyvNSL&`z?;@gHjjePn)mJ+Brxb5Uv9LH+a3 z;lZDxq*MO7sf!Kb&6^yd*;m5=CS_5CNr|7k>FY~kSGVgu69mXVvN-{NCY}rf{I`RMwPHscDM& zySy4S0RA*D0Y;&=#_kn5In;zK)yLleX 
z^I;c>Pn`u_-=2M78qw7h)5f^Gc*gE#ad5E|I)~RZIp_{A9&f~_dOGm7AWn;jE@cg! zgQWr#?WU3U zAkzkCE?nU#RrkBgadj3K&o09l;cKx zxN>fEJ7T|%x6F|D0*4n)-@27Z0+DP&=`%p7^8meDsUk965$7P!V`jt_QFyMdL0waK zhjIGaM~45kAC_VIIXKJNaz9b!S9r`%r3t+HnPUGR4c0%I1sD!qGx(JnC%4v$%aKDs z?q_Fr=zCYH(!W{sd3)*yDt1nC)U zU}~1li$6Cdz9Y<;oNbe`ptCR7+xbS`zZ^3>*Ud=hwuaG+3d(mFCGR~=dx|0ot)qZ^ zKFlu3Q}rHi8W~fd?QqF!-S)4Wv)o+oud?SICBIZnm!JJ|3w(@aQFs1Ar&nlS5S6zz zzBPb_oolzt<+ko#5$!a|;n#dHfTNv#H#uhB zoG7{IDm(bhby;4(j>~()`MB}md(Auto#LXaBa@~MR|X%kp5>oGJQq^1(~(WTT|Jz# z&c9GS;hpL9x%Ok%8Po@>FF}$TUNBro&MZbOVieyi3nbz$;0vJ6huN;mL7yHDaOB&xL96Z3oj*9 zSfS7gdU}Eaf~5Pv@!lT z2#VR>auyPp%S0&@K)|(l@)h=2grQzZ(m66`QAHff>M_^IqsBR5yQX-%}tmE z;Z|^HWqn(tbI}-5+EnlPX+Pz27jJMLuhM;I^|O0Pt(ULfKk(CFqhHXsb9vt_^y=56 zyJMSWB+SpoRHySP%q2_Cs1MC{rKH8cL`t$5ErE0*GKnQEIL|i|NlYsvi&jsreI*UY zSj1lFE-mt3X6`KIghH>Kedu=(D1v5`loq0yHC^QyBpVWN(Y0(D$z!XRrgk2-r;&8F z?;iimX)dDWJ-*VDCv~T^^iq0e)tVr!rgBx7`)eKmrHoG zS1+Zfdg9zt3%kWF8%~~uM~D~aKVSHLGwdYAjsLDC_p&IU@R04sc~UJmCx~`;uUb!arhs%!#A`WAtZTLL zepJR#kfe~aq}<)-9FMAoI1(jf$G#{}ygugXv$!Y#*qJVVtip-ZB$yP9nb+|@C^NQV zvP`U;R#q(m{4rVJ~;2)CR+egAxr zrcLphPU&07ZxuDbI+h5R#9Mqeytw-qyQ zxSGz=<3BM{{38SDXI<#C{=`;~8EEJX7D>5rhS}O|n8$2np>oTUU3I(fmx?@6?X+-` zP*J|aJ0Kox4Ko(#w#wiu3reB8kyr=x+u_B5EL?Kv`n3~xioW%nE<@8vLnoR;YCBueL<1OoC4{B zN?n6B1ufjW6}L$AS1)YW&+aPBCCK^P(E7tI9-(bWA)!o!#x!YmltvvMxAco*Db_|uj3bzg3={aFj5BW7hU+E z9$gjyO$B3nUvx;a4Z4I*&1Wv9ln5ySlC?mvp$-&E%%ttG+j6c|^~mPARL*DO<-&=| z8rVE5Q|+^(DLP|3+Qwva)7`R5j-l=2=3?w_BrGaJj2(2FyQo(es*X)p_4v)}`yjpa zI=hUg@Q40NGrOVCdvCYPn=-Y|{_VbNKFl)f%<;syy5T!*^tQY8(3x{0I=8vYEw<0p z*J=gl+c?)R^|V$EU46K8@;6KR3Z|x1RD0AV6E^<+WBtt)9}#lX`crksG9hLXrKsU% z&8v85$(EK=rYzyY8S}xV87EKkm*;yzJszD~cLpjs=d2A=%A+1VDUWQo_x!==3!%iH zF&$i5@u@-}va6c`zsHRVnA(9#Y5I!_pIeKFZrA8ww^ge;LQc0q=v+rHG&wX~@+7ep zGaxB9a#|m@SP3$<*-ueT4{(8SoTf21QR8q3_)fJ;zkl}G3NGm5T#IMY*V$oVr)x2! z_Tir%*%b|rt$qe}5AB}@X_D!mhaV!s>_%Yd)+?v2NH--AlD%q-vaV7sql_;67bj#U zftTY8bmUGW|7@Ypy<6=9_}416o?S9IzxmRA>$`|(>fBZ0kTlLfh6=-DDQbUYsTv8S zt6VT9n1pNz`WTeY3He=gxc{PeskX6<0+W4>emR`hg~4(5acDP5XBU3Ql0F*MPRRZ0 zt{YLeaw%pp>z269Au=ZW$54|*Mv8vWLPL4uv;pfptDZ7+HWYe4REXt5Y1OzY$L&G- zi8DpdZauLZc{uKAG)}j>)!Ab4y)KO?woVd~r5YObHO=^hj=Q8BN1vb%C_$P&FEQ(t z1V6hNOwxs6f9^Sz1{mK;HT^LqTv2>KHyezRP! zUv15ju=)jEX|HzlGX8lQZiQ1&Q&I216Z`A!Cy((vUyo>F!=AktnYdhbRnUmh+b3)O zV59uRe5pQIqzKcMT8bqid4m{T?VdoA4cgxthaL<3BRe$6GLnVzkIij>-7^nxDav^y zCmhZqaw=Du-p)$|l$3Mhc{B*@>}-tV8nnx{7Fa9%q-21Hf(QzXfW(or%tGmB2Z<;v zd~PE-w~-+2TE}E0_gFv{+LQ#mu0SncpOleO8!E$qh%gO9p?2FW`-gUIt+c9}tX(0m zr12}>3s4D0KK*Vwx6KebHB9FNwVM2%TIWAzU3xv9w@A(*t}d>equT50{u`VsP~Cg( zgsetEr_<1}FAyzf{@cuk6z}KRDZEc_oPJ9^v*%;sa_YC2OjDJlZ;QiWmP(;o-+G#7 zghR34l4{j1{4ed)(Q2vP%bT~5nQFgk)t$wy_S#rC7^N7oZLMa9zxAEI zt;2&6%G>ZCdt<)+TelLuQgYu7y65;w>yEuxw^1#(t2(%&)4IGAnsq9+mh`+=N`s0# zj&Ib=e^qXKvUzX_F&1`Gks(CZN_Z6Jc-3#1ipz8iH8w$32A{Z4&fRReT4?b@jgMvK zVo^A!H^sIWIL3GJFZ8xmxEZ)CoKmrT323eT(b?TM^7>9DF7bX;3D~18uy*rg-BRp) zO_%_kxcXF{Svn6~wNpgX3l$lkLIpF)#D2~_@ZxhjZB)+VS7+3 ze(KuIW3feH5=|)@T_D%_jUHjw*+wnp;{iqKkAnpFPAoN9UJ2D{+2+=FVJ0=%%Ae;% zb6bk9-ToMzZ&3E`P9A1~CtXA+@yYqE*s*GEKW>M*1SY12w-+cpjC4Vs|5~WR>r0(^ zgoL?%!44g8WKGGoI@>zW0>`Gf$LVA7SsZp(&-zUMK{poE)Q9FmkMHM-9~tf|M*agb zs~)kH+&JDCE>1bL%2LW!KDf8Xb|8;4I7~|Z2Otkqk7XAM4xSZ1fNU|*HS?p3{zmSp zjhKJX6hGqoRA?0~v8z-VO&N$`l8Tl71G)`7Ph3nLTE23IV0qMen0j>U=x*_k0i!~d zryZS=M+mEvgrXhseW{2|#ngh5C{YF3-?j4M%}a#jnI%Jk9i!+U6Kgm111koHpoStR z#N`K~R}6~0Lz4<04-NTY*Bz((6<;si6cazS?*2U7T&lda?(2!0SFcqSL>A4k&j_}r zgZ&vL`80?TafeJno;_E{o&rNjpx)q`X2F8<(B zZ#;ifMZLVfI_jZ@Op;=!{Dzg^OJ|-Bi&MO<^?3{* zy6WmY5LbD;*S2hs<5ekqHUhe2(I18sz4%eOLbRGYyZY+j-uwq@>$1A$Rd$gR=Tz1! 
zzZ<~Py1!Xq6H2UMd=NC90L?j%bwVLP{HX9$LVK@;3tR z$ADA=bISaAF`Yn#tAq83wp92NcF`RM zYA59rd3!3&H91z~pPG9o6XfCyOhESXEKUhUTybLj?@XRNuk~)J>-c_tU}W3z&MN_n zM4wmvS1%M-7%GcAe{^+e1Trh&4)ls_cDhiNa$Uk?T9 z%j>O_{|>}9-i#W(Ptm<)=lFHxS%X)#uo-WRW5s9yd_H#cI7k^QS!^=KFtggskn@x*!L zeU)1MMtW807xlj6orrGO``z^Ig_`fgrda1%W{&K*-8Eqr z9a3cwa)vjN&pNQ6+n?@DvQa_bvFW=9YPVrlp-^OLr7&5NX%sIDN4Nwug)e2*ya`(H zMGenU7NJm$xYmC?KK*VHc0H1RO7mCY!n59?S(B;?K;rVtu5|sfM-lg`(WNIHPhb9N zn!xETV8*GQx#`#-`pk5+3pT(fr=^j$EPSHZ{@Pw?NBX-b*Lr#0Og1jj?K89^zoeDW zZ#Xt4GBX-44!9LMFzweia70-FZ+ryNEWUHO?kI}29)`uGD}CYr~T`% z;v~ENGiD+^&sD9gn{5CRR){;pAKE%FkUPARQr6I3RD%@ilzVkMy=kROuDFJA8fyFL zOYAJwX2wz4KiS60ek*7wc)%cQ-Vj>zknlTVV3P|IC32$25L(P1b++4n(}=4)yUHb; z5l^|Zo~jIwK#)_|EkVF4c>oT<$69BVA%lQN>mdl@V~J-Pc=d zKeu9rXnEJ_t7vx`n9NG&vJP=Tyu)Vt7S%zg=E=)c+19focF;84%41mD^h*Cw8Dhgo zduUssrloA@c_FSMvVTdUHRGPY4Ky%^*>O8WBJzP5bXnv`JJ{ouw7HZlIu|I_LRppL z1Z7&RVqx2S&^pF#!O&vslE+9{O$}`2LmryW6cY~KC!&>iYt7Dib@jpAlpgnE0SL{o znZ@Dyy$f-s2|f|cP4_p;jABF5h%)_SlS|kvPi&wRmh?%N(c60ao~uV?L8)#bTMI94 zP)e4J$B;RLt?yVf>RDt;&1AoW(6q-?|2%8Gd)DjB(-78Aj2&i8cuYgC&KA|aeRt^j zE+>D6{;Ipw)%xVXfldx=*o_e}&#v)7+ib%b!E&(eW3@~rZiv){kyn1opStxB#tr5i z&VgAetjzP>i;rxx{5)oV3a3|(WH+qcsO$P3kZfU~rQPkO_}~sch@(t6iB)2nuGKf>nEkwm&5!5wXBDrFeW%H zkExMLc^zuy5l4l6>S=~2ej(bD#O#gnY{APma*~W6UN9BNuB%VVnh)m(x6z_-xAS zX~z_c#T(^r+c`AM0pgk{+I;?tLvo>w|7>Xmf`4b$FIv!v z$Lz4o6hDQC&a?V|(Z$Pthskh@F(%&pi4m}u6OvKm9xKW5$Hsrb*xL``~H9u6JDXUVpz(2yuWGS@Xr+I!hUYgOJmOglmnF(m(rWaax0 zWlr6{!>cxJBmhUmdJ0daW9~fKDOB#hlAj38o^rC-I7Wphf*p zaRblVSc)HehoJ^K&mZ*(4Gg^l`8H*QMDnYbETtFr-+B4weY)n^vi7B}1WI*dYnmnX zN7D}nAK6CAWD!SO^=G;M#|vtsf_6S(x#q;3k(0?v6;8nJ1I}!qD`{i78~B@od%NOmK^rDV5Ut@JCQg5}?VFjs&(>j8M-ZVAr$4LzfJ&8LyC~BtgZ!fI+Upi^UO0Fsel)^*qnybRIktXT-5U4@>|Q$<*fZXIoKx(@Y4^5|x%pYq*SxZ6 zb_51V?iA{BKQ#E?n~7k>;`-|1ewg^q(|_AUQUDN}U;sP5lFGjSF!3v-+X`0G;Bm$4NgbDJr*l7r*`Lzy1$sz|S4( z9@@H(Igfr`7+?zLyyf59OXn1x+Cah@j%<+=7tlEoPnMGpssWzrv{CD$M^PzdC*rsy zS^T2YI}LUwqW2%)viS!*uw)w6Kj81X%8NTypKEih2+qfM?S}pViJN@yD#h0e4UQ%- zL96eUE&sON{0uFXFZ>6{q(uM8oVWf5Jks1Hh14+P$XF)Xu7Q(TX~>2wY4KgRI5U!z zj9m?3hM!97F{tR|PpEjNXfwrYBnKPENJfiHd9Tt9lHyd3M*cBpjG5<|Ux)q!PP%yP z=Nv!shbXL!*-&n*etk7 z^_MG<|L2&>YV@+nnX%(xR$4yss6VU4|A0oNAw-6AUBl(S<2v$6&zlWqI8DpzCMHF~ z26&iQz|nT9Z}QN|`|~*d?MovjpENt}_(_UX?v+y)FFfA55##5y{SUC*tGF7We3f^u zI!;Awm8q7+<{vB`t}V&EbtB6)J-pLjd9=>IDMnpB9b(w@D7&*fZNu*8zT)yQw@M8) z_||aWk;mae_f!YWs!RF<;2isgmVfA>`=;GsgNe0*0!9BZIJ$dy=KS{#i!*cGUu5!J z6rUW#Glh!%yDh$nV&1})RG##gsVR*HUw9Adf?y?ppRErs;4R;=0lu5I7Z+|hrc<+R)>vAik1DY@AebzW=!q3aQ@KB-)rqPA_Y zOV@9|I16=z8R~pl-60LE13$vYOri!@N0(+e6yrOjQO^xmpOkvtE;U3 z9UjL5ILokMarY9#WkDeN8FM7!>qJdavy9}R{DW=-@`amUlqC3Px|htJs!ARh%^564 z59dT6n1@`v3&;1wH>LL6^n5ZDpWFHeQXTA$2sxWdmqH|eG;+SVonp4bu+*Qu!jypl z*{nL~6P9%5iTGBu)^gfEV2}Ch(#+JwxS8#)6;lTdhWPdzkQ_+=roMHq=%3n2tCvwyk)R#wiLHn<5^RenCX0FS0q+^mgyqa zqL+=2%rMahZ4)ZSM7o_w}<`#!}&}1 z^!z7NCC&{zpWkwp-2(CtnKJtCf6km4O(i}(l2i@Y3i{8`%H%S>=xzrVr+#%H{dPZ- zm>c~MFgq&wHvj3+bR)(zdY^HWksES>KG}3>{rs`mVV8}gB!dm6fk|bXSuvNCGt8+@ zX$;Axz9@(qe>&y!4@l$c*kNkjVc#K%q~F<;)^ene`!Qwq4+u6O+a>cUAqSM}!k}Hw zsy8LyOp4lOb7G_I?8~3A78-1KYW6BE7kQp^D$Yb>(g&g$@xHmC;R78<9^bz6njNM5 z1DFnBxF#GWvd^V&>-f<8y13L}vIi$jx&j~lKQF>!>3-kgp;)2ly8-5e{7B>T>}(!||5 z*{P&sn=_}%BiB7epABB zy~l@h=e4eEzI{B-73fnFp(tUDhcwvG61dce9b-Jng0 zIsehic1btdzRPDZ9+LDD(U7D6bgGd#S(>{V3pbMtwhcDr4}-2OIZf-LXPXSxeJ||n z=R`A|mbLq@TTJ`Uz=>oTY)bxZJCx66l!oTdcM$&p??yfLpX{X_cfbD+_`SpY@gGbh ziT=;8b{Wf;Gv{wSy3Oq2|9?f?8kprn@n1OH3>NzZk-YdgZ*KC8Kd9M%9I;V z=IeX^a}00fGw6);q3ASL%d;`x{$HGm=1t6DFxBRN&tItvcUIXt#m=+wyMsiqXuEN$t%>rGG4yFV}B3WP(8P z%z81Wi?)9fC%v;Te%NPl&~^0kVQC1~+F9~V5)yI{*{9DO^?z@beME3~+v4X>mw_+# 
z2jqX1XHk;1_@v@Ic@cn{E5@wg29HoRs_(nl+F1i9c1%YO+=`95b&A&o6E%4;DI1QXE zT^f=a@m zGXEWZ#rL?=)+XVvXu-=`gx;&0w`j3Q_zaMl(N~mOEivIjm`=IUD>hZFNHJKb18Y!C zm~`jICq#!D>RjS<={ze?0A+@VE<*MT?*QN(7&O9zpw;4k90qzmgboTgC2LH0JH{5{ zx*bsI1#Uan{5rW3&`3om!d=D@cc^}Ucv6%$tO4%v5HHxQsNOHO=vPQ)9>brNO?ut3 zJ}tmV%i0eKHjSV)L&V{=_0t7xUO*ZE5H*Rf2*;*eyAHn#f}Wx<&3Tb=BoPmjU5p`) zGaGE_V@|upn0Xp_TmXDF1>i~g7LJ2d8-0y`{mTHa06j;N7 z{;w=_izWnGAf`bb#3(`ea-pLax&aFi28X6-HSaSDy5>W4H)s$zblo&WwjeVVl1;j? zQ#0vw4QBQM#kGzy0#O8~eD|bV;x<_O9}GJA-r#`=QX~a3ae^4v&u_ZU%m&4)%tX_9 zF!=c%GW0^M*&S66q3z^_H>*S{&pGO^4%DA10P{I@E_6#7GuzLF^o_WFuC}vT^Tm_U zei#xm3Y_B2{epr{I?INXfRJ2=+$kKavmR6hgffDxsoks+bT~NG@O%zRM|V(_CF(*l zI&r!}fP3TXGN`GW)@qg`46)<49qJ)U)pxlv0`!JbZ&(cCQpPl&c8S3=E=+6dW;3i@ zBDDqMaiio!fV~S5oh8BgUiIsrEjZO9_3xqnQAM#|ADT0f4@^bINWKua#e)e`ZO!KB zR5V?>$qQ&!5$)HIz$p1oB`4oQfTbm=(UrH#{rkz=3CZt9WMXt07yOo~%0S+Rm+q}8 znZT$1mX+ZY_7`!C=J5rB0J4Vx9zPT7f^RoaEB57@oNq(L*9tyO5c{A8H|a@4XyP>E zRKb2@%0PVLx)uTKteq#Nn+i?i316iMzRb3aAYBKaRGro-k5yjs`aJ~$I#IAW8P<@3 zOn#x)eCvY~_&me?ZV8o{@h$Q!;h8TM3a9v$1^727SQrJNh2Vy<*bHzq7keZ19xh(l zoYpA?Xd+W7k~o+hN=jp@oChO~Sf+VF1)=lXwMX%$ zGj4vf_ofN_0*NIeAgS6#4Bm2{BADh2>TGolHYwQoYSS|1`YA#jAD5)>1_xj0f(Gl} z(z8K8X2HJxF2xhdzs0~_XvbFNQ~CtZgL-0&2C%W5hBJV73R##=wI!`91zbW?ynW-4 z)8mV*$q9jTZxQFTCcl2kjx6NC1VpRS%2?*$usSf-rDx(PML&AvM2Xr0dZ$#P=u@MR zlr{=af$?jh3)qgKv$xxz6KuyNKe3DPu+zYcvC`nRocdW}=V3p9K8<@SW~k*IEBRhT zS`i9*J69(dgFbh0iveM!UVFX~wPbvq`!fWY6B4>lfjcjn7q23kuWyirQ`v@E04O>2 zyNkjWBLDqbhCYwJ3arx$`XHVsHTgVwp-=y;2hfsZ*kTFfe5gIe0EPrLGA%*xHDd#% z0Hhki=f{?bd)y06gDjhu#O8Mted7GYc05j*m0fa)<=9kj!^EVkNWPYqHp~w#+S}No zAqDVg3cP7+i%i4udKmr+jB3Rq{exb{XB)X_JP$O}yPE}i4%5`&|0A-N`CJ95Hox^_ z(?Y18t_NkV8cOZNxm6>;Ejr8P5C2~aN5ciT`H5aFpCIyLVgjz&Xye< zA)z72YKr-$=2>#NP3R>9y_0OrOA!E*Exz&5m4Bt^x&904G8^pq20I0 zGvgzja9i~Tx+gH)SsfhXj`XnH#DAtEJNIeIXNtW#WSaw5-Td7o43mIB5h1z~NH;r1 zFb1(J1uBXR_S@btVa0QTg2ARns0hacMiPF0Ix?*VWimaPUlhnxii{t17qDKE^}#Gd z(_#a$)Ew@DOT+8=zK)ko-J!@VMkE}0CX=4f0P6~i69m2G}&n0I`Y!rSjl zj-{Ccb6QDV@7*v16PtJ-4YNT-m$*`N-CxIBd`{D#Vx`$*iJEYfOM4^_YnmpN=V`I} zGu|c1rj^wey#lKu(Iz)W;|rC92}B({P^o)A79o8@x`2H zDei;1aIW?TnWA+b-M5~<8y*vJ@yH|6rrGeqVo^s#fO5R#8_n>=hdD6UUXj;$ge6;( z!c$&k4q^Ww1q>jCP{F9h_t)Yx(6ctil-Hf>c*6&AnHoR=+zb4}OHv96WBoFJS&z9w ztSaybkn|B?*mTHl!C%pm)DFS@8HF17wZ6L&{Fd==-V)))L}c9Wd_HbLc4UwFt6K+Y zJ;DjcI3i)1d6!$1c^4M=z)}5wUTdY8PO{|;oX5(dgjG=6Ubn$3<)3(Wr`Mp z&TOdz~ua%5r(BnJGcgTjCy1_b; z{Ty6JST--xyg$!8FY>t?H275B51}4VDENiH4jF@BJsHpfUIM`-1AhbJ^l<8B%VZC5 zBJSZBmtq@sd2^beHz8`jWpt=(Yff;pf%NW?l;ASw5`1 zK_RqxbTS}9yr%-%tpTT=-Em$dpp-o&|6s>$2EgE;Tw4iS7Vi3Z$bNZkyY`wnJVGoo z^6pnSw<`J@f^CCv(ST^7-Ns%o-g{~imdt6y^S7;S9MXnJiU@vmNYoQ|r zz>ir9zSzV;?vPXGGq^JIqy$-Ei5EU@8-e0~k6Lcwq}Ae{rg`nl5a#1PxqX$11H)9a zJHCyXtfhVELrwv3c3nabxvV*MN%K4ZSW3`n=8FMM?;q4CrbR)x#9D{2yb4HMpK=_S z2)%U~ac;Z~6wVYo(k)!T81-J?1peX!{gh)WW$>~TAW?T6!k_29n1zP3MqWPabPT%N zYVOOE(?wU9Sfuc3P(3aaNzz9M&GmH#lTXOy1)8=7Q}IoLL+aONwJf0cL>{auf?^&s z0Unf+@P3K>N&>E>wRK45ql;iHgC@oFd%j;mDmfByVBr~Q1~^`mM=PW69txQ=0mm|M zxy?xfec$F8Zcr*#X=DF8FySJ6)b;#+0VKqGH8IvTZIU2`p(~&AlLCZsv6>ml-#)1E zpv-r5Q6ZKy(prg(9Mtr}muB%O8Q`ti2S@^q28U_>Qq0NF)XH4@uJQ>gMA6=-0fN31 zboQm>MkWBchF=Hk&|(RSJMl)eHI*3O!HVXCIQpLore6WW6zlWh^nwNNW_RCHpuHQi zCna}Fgk!fCi(OJivB31LRu!n3wyK1W6@%Bq%|Zv|J$Yg6NBsVt1Qefw3eLL@=Pplm zOHKS(MmFs1mOx+956D$VrcdW>+nEKuD*A21vBdQ#&pHAvHw> zfN2V0Jj%IuRCxl;WG){g?|g>~@NeP}#tG!O#GWZN!x+}c$8j+IV1;hKy4lOf7Vy9V zMFI+OL3!#@Ca7A87;lUpkE`IXxM=~N1zv+9-%I?zG5Qr{L~z^ajs(Ew7D0`3Y736E zEJ|kwv)5By9Q7;K*-_~EwL7p-d`wq;IW#^~B@SJ{&Po;T%07zA>BkEn`?Ht=$3nLg zipB^0#-^JI!d_y~V9YmNX=Ex-!UU`t*(|_T7dVPx?sCyg>^5GM?z_4hJW`Ww*K{$+ 
z(;Hs=lHzE4U*lQ@H#j%zygurH&jr31s~@sy%+gHpk8DGE)}dp=HTuF2`i}vuu!M66 zeQ!KOLmvZ>!c!sx#K}cautn50!(aVeqhz-qU9y1FxZAEsU=AJR)m8ib6qzd?rv@f! z>2v5rq2{s?;roZF&8V;5aT;3E26gOD0xy3BZj^?zk~7nMO5KP8tNQWHF#vH5vVl{c zg{>{Fj2}=Kh}}|X}<2g$Q3pH|q zA&kPAvh6hB%I`h&;Hjq>Gc;l* z*nW;drv(u`leB5|=ClAS=@Gz#qLA&>ZNrM1%X}RbAOCdz69-O=tM^$b2=H|?yHAhR zc&S0x{!3=(#Sjr7-4v$ozAg~E&;>WAHUA~V{sT_D8ecNxkDq1~!BnDcA)QFN*6SW9 z+T?c2{6%Zl`Sys34ML2HP>TiwOgBU$zJ`-rs52t#h@Mwa5qP-y%Qo}gm<9d)VjKmX za>_E`)psO&T%#!E>7i8@t8|`UG@pG&y~%vr@hq25U|{ae8*543J{4KsQ{1n2#Jnxr zB()MQeO#vaYUK`UgH_k}84C-bBp564?_nA)LGX3{$4xU{4`Hz&u{aRqDaK3|A|Tnp zsKZ2}gBdX)`CUNSji;JILL5Im?@AnUJ>%0cHf75ZARb5|G`6*#4Vuw-d9;M* z@lMjgOwqZOUezSlnZf;eZ07GwoD3w`B8fBo-k@#$o`=K@JGXA2Q zfzn%>meOz^E}1!ZD;fN5;Z`C(PsWq zi(>Z81L)EaguM8bL;YeJxw3m}OyC0#uOa}F)rj#~r+^0yeLHC6eA|7GiSOCBm@8>R zmq7lAu`&Mxr=0(^Ac_MR1=;CW5^eNf8g(DSr5{C+wHaUPIU@kDJ_0EoeG0X zYt>=L&@eg@;{DgikO!sf3g|ep%pPPKExh+Q4QR3&5Do@jniPvj0cZSxAFBIEiSg@j z#fanY(5Wf2;ZkV-yO2<`HHiUI$Demx?#u_vTO8F)bMUNaz?9^dezo4Sj6fw1iLd&TJ3 z)LDh-*9Y`+cEQvP9w^W5BUbKbJv}#Xt=%q<`39c*Jt9Wb-M%e{AwtDW@nZz&%WFbO zoeD2LrgkIoZpWi^{knl>2$V;IwX?`Y7=jSkZ^WUwzD7qc=0j$&X5cPR3^Ud=>6Uoj ze2&v?qZK%L8E^P9W3QNkA~viD;dIG;$+V(8!@N%7>@93uim$Q+P|%VZ5?AMy4Vl#` zhs-wCzNS$iA})l@CLpHX_^{X-Wg||>J6M;4ZGe-h%8QTK}RIKL|8RE zwJSloF||cm_n?&3=T28w+gjwy&D9J}FxJ9SJTN&k9_@6vQ${^?EMy8|`tBc~q%ya; zTb7V;F7ZYY@>)hi->(PL9x<`r{Nhcl_uJZ!qYcTkc%jT+h@Kb)s@CDJ2d5Te1iwSw zK!^m38)RdDef(NR3py}hf#Ii9=Dldo?Xm}+d8B!uZQB*Q`V51Y%#owqm9JQBOz1oEezOoOUAGQ8@^7G<* zv8%wgXq`W>>fRN6@-W2bs5b=2p(2y}S&hiZZv^}T{F zLVDqq?v1P38?PRTpSueRejjML`gm!t>QR2^_DO~apr9IntXK*3g5$YN7>T>p&+V`%!l)3NyWM!LQghl&=O*kKe&GHV`sbT;{M5f znGty||JyzW-IW*|k+($Y>cKRmq0$r#ZBm`+sjHUSq7r#&zhoNr zrX%Jsx(Ec)+GMrX1Z%B?wv7@~$1$TMm_V!`5VCEKoLf#*Ciym&~UZ9wsljhTz6GJQW z=!*kp`_gQL-Ht9~6nWI$Aw7D${($;ohymE>MM3jgbk!?d>OUR6mNu+v6W&|tLjf>17N z{s};wq(q$2vyo_y93fi`TZW`VQVBT< zIm!_+M>P@Ks4yW(cTsYsjOLig)#1MHlB;t2{r1=X++&Z==e_6a{k&cPj_NtS|7ie5 zas|ErNoyQu_@*CZbl!arTnsgC79e<>psFaFdGll^XugLCc!($R{s(wL1j_x=Hiu*} zR2K*HMy^(tZ?VwO3(xL`!7F(ANN9sa7XVV^F+^&RJ=6T!KL@G|Da86C4W@h1el^#N znRwG)(E4)O#+#v>lAiGQAQ0CA&qd0>i0OkO6O4OQmIXU+B3t7?*#O1jsFeqU3nKu+kWasfkI- znZ{Z38)nG5g`YzW4GfTc=&C}Z*WQBfTjIv#X{mTZ1ne9jh?j&!vog{IK6NtjLGT3r zFyY-}!0|GWfOjRV(#>Q&IYU)6=|e0WFUtC(==X;c8XV7g?d%OJT_Up#3hx;&grcyG z1=CMV6s6=~guLq^l zi~+|}S-*8vAE5B239&z#WufgoDcsPZ`M)**%pxk*T6L7xmy4H(OdY&%I4mCc7y#V~ z1}_BJS6e!X;Pp(Ci-XyRTUY`ah!dA3J0FAg#|>3j`x=hvtDgQiK?0~*hQcirK!z%N zF_Qk$zI4D$`OrZNr{5pmSm2P%_BJrIesYUrWcQ*ghB^Q&Lh2gdMBMMRUjU`abijQ@ z=t>yZNI;V)@-VACJDK?|Ig$h*{I8sypOnDXFeutdSwL&NInE5BD%RwEenTilnQ0K= zlsH0T8OnNm68A}!B@Vd^X?^9)MwHS_@uziKT0nlX;%Y{tz)eMTr2Klmf@{0y9|tJc zJK7`mQt{nx1KT`O?!=oQGdVaEio*57>w-YA0Dt!)6bTxPg;;i~!lHy(vI$mC<4w!I z5s~oRpaT)YDPBm1jg+urn+0`GFeyz+j+wqx=Iw)PVnNyXoSZ}!u!sW#);R}n=E2<| z{b(y0)n`ZQh~-JbgDOiQ2r_~b63Uc59FMWE6mL}oWXa``8AwE*MmVt515GW_^;3W2 z&+xi-JmA_<%DG|_f>-2q!iT?^1Ap?~idrQ4qYJm~vS+XC&AvMLx9Y~%9J9ojPgPx# z#wl;QE);!u?Td#*OK(m7L_hs2^)9dKX=nj(4%qA6IeVp4@Kr|nJ_i-6cTB16n(aLc z?&(a4=;#o`$~nzLbv2{rud(TA#~``GWO?Z5q;u4>1a`Y5*U8y4Zq=(|t>@CG{Y?iM z>GBnXTH}RD{<6r1v!o1-~sNlW~^1 zFa4PQgguvP_M+EzY^sLL9`jR(qG_Oak=z>9zJ*t%W~k{#QB>;N+C+Out^${D1*}$D zZg0wUwd$CUl(g)f5ZwIFJK6rF2Fc&2)&9xMf0uu=>rT~*?|2e5gJSCATV9&%p?$jz zyH{CwlP)DRk$6459zQG9<8-|hE62ret?3=4oEoyQ>xE|;qME-ycF2sl;{2v)=KOI5 zzh#L%_&!G9{im`goW}^gW!a^|w-*Z6OkDK5~TX)>|QS4KLtR!ew&e(KDcV(Rq znwzrsL0Ng4S04pCFet7ClI`ke96AhVH7Zz9qr8{#EA{}{7n^)d^xXii+o1ym7HoEK z2xL|@JF^U10f7ZEptS`QQ;z?&BPJ+gjD#BWu*AcU8vhuCl_s`*bvp!LdSXuEIvG@y zc!wFZn}eD}^CA5H^te7DAf73Q^;lU#jCurF*Q#4kN?;N4z_M5SSe!|5<2PtPz9Qcu 
z{E#=JB|_rr0W5B-N!D!ncjhM+Y{_FYKg3=h><$XC!Z)18FSeY=exFmtjZf^Pjsi`h zJE;vI1O%tvoIrsuTW!Vfx{Q7B~bX02+TJ;Gm$z=3Xno&}zu|HwF z@T4rIxJfn_FfXx4ofa#_C@dlmXI-?T7n`P{>QkppYhv#n0V?@%Th&`g5UG!Bs;d1$ z<)e~4ycU;~TRU#3ziL_;i{ z78k&00sUjMOb_F-?h@>SAf|zZr}%t5ihyh1lX5ljdbX=1f|CIRh--Yb7xBLWh<>zN zuS~X%s#nr@j}{E8IE+3Z>tdVmhCt8?S;O4ytNkJkZtlni4AETphiG6bqnms2{=Jp`vlDTePSv> zkR0>$p| zMuy`+#&|Mrrcfe?!4Ao}--wL;#|!?m$nb|-42}jeQK30OyPZEmm~yH9JpN!~hrXV% zhP*1V3WN*~#*E-|#>yJZ?YVXoVHZ@NgO&$3+qCdmU^5B>XlOWSUB8*Z1dLl8fYb7& z6#YU4n8OaNmy7Efl8s!1Obl%lG{x^xO;VJ({>5LGAxv3c(}00~MOaZulJSNHppD=6 z%kwy}PPVB(s(*>GeI89mT{IpB&MW4v84NzvuI5;F!T(e!|EIo)M|di05^ffC-KMLXqje-6Q-5au?c)k%2tm8OzdR{ zwk>x@KuN;S*l!BZB;&2G0aO_Q8PXo}B#w(we;bvw@%I44yIUb$)11u3Dwc4XfpWH& ztXJdZQ2Z%8!a(Djd4>n1(Eq3O@Kc1(9~IAi@A)vFQ6}txy_- zf{iCW4j)Ag$5u&(c$+4i+bA?kHy)cvy#5l~XjMfv$$LUvBI}^mPHRC?^3^RTwUR38 zAE&*?OPX>bhPT%8F(9_3edmo&55-xuCX!6l&J;+BeKRN<0OAo~XdwV(#z$CgKcy}v zqqXUfx_|+oUf}jbCw?+rWT&BRjyP5^;QKYM-}(MG?~{|k0{>S6)O;45E()rKZiSyb z&a;ycB(N9P^e+G0_2E&ffzkp}V|O|nns3BEf12I4sgclg;qu2E9jV{`TthPp z+;_0w5l_aYAf&jWPN+>oXa4tcA~DTr>%{&TdzI%_V9g;Iq)^r=CI>l-I&SHWtDq`lP8WgP+J}|K)w)Ye`1KMV_ml?R^o}}&z$LZ ztBQzAdM3Px#-YE53%|sC7rws%Buvz|&$GR#bt#H*_(scge*>?05BEdB{l}iHU=6uA z{T1zoOgS!ls!1n7Mjd zsPiT?yo#JOIpntMuOB~WN#vBYE>g8iwx|xN{#}&&F}~>EKsofruts1-HN`+$_sWlp zSEF_nSH6S>D@~s1#t= z!jTFdkL;D#oX7axO|M!u@Y305d+Y`IoMzn|@A+pH0ps17rw7Uwb{XR83zH*bZndXN z`xQ@BZfxFDZ(q;9B6ss6rbDc-KlSH}F3Wc=Dux;wdG`v&nlF^!3)W%=tSJ{r)eLZZ zVyL;JGow;%QK-g$N4%xj2v#y0WHJimIjAdu7a|+~lxsuKqXr^bU2^mN`qsK8kY3i< zxU!TOjPs6?^(q`8P&8PJWSG0VW4vsm!gvB$dWRMV6`VfGMj`e8tmnhbB@cKsfq$q` zLrqIWy5=k!I!--J0t4~{nLt_Kw={D930p866QRx=rMmR$W3WmDGCCT%2o`IT)x<&^ zJEI=G%U3DO-D(HpS$Oas2%-QN0w?J?H4zB3EJuCsMM2v%(+7?EtM*Kh=XO>RJ~Hr1 zJerV%QW$Rry##d=k@_<#w@#7uo5~1iCON4bPu0Y7&>=w{NXft`q?hXdfY<^g4l7v6 z@CM~tKbEsMur6tFyLQ;sh`zkAk=PC?0_8M^Kwf7Gdc|A!cM^l@TR2m2r5nsC21Vs1 zq{H|-mz-4%&WWd4cu(h}tQa(MyVT>Xsf=t5Q{x1KRJZ|j3+#K-#bqh>4J>6uKVp7t z>41mVcp6u$GzG=4qZB*de*9@X9zvC;mET01vD!I@8Er~RM(0kB5aOXry2fNlrdP76 z*wpfbmbr(q(+k5?Lg?t*4otpD7BHlkeM+ zkZFc_Fkp(3s~*|!$Vuj@TA&+h&j28TtJo&FB#sAefX!J5gQV^-)b}(NmJ)zuNZMnIe1B=S zZi~pe>LnO=Aiy&w=b(yEp9~h1A_CPkAruZ{Y%5kw;ONjf{;&N{SOT83BS`Yhcag`x z1@EG$!6j_Q_|jI5(*x@j@xd{*hKXmWt{_u5s8UTTpUed#fgG@DDeiV9YQ2yG4H31W z(#RkvDx#&1o;Fef{_#tEg%0{mik1=|Vd8WC2RJmJ0{rcN**~B*v>G>f;&YpA;^9zv zSu>MUg_Mt+BwO_#atT3jMmrP&orG6~AsV9xWeUj0v91*PziULMa~qLY|CEoek|5kX7eCOy&fxFn78>ZMG?omY^7`_3d(c z9R2BF8cuV68e_}mL1|y42{G=Ud{jr3l*wWDEBfa`72R^rj$b|NGTqon*xzGn74B5? 
zIkZ+EQm=n8Ar|rO-ji8NUf0)0r5&Pi{od@ywhLNzPe^wq=5Y19qkGQm)=pK7Z4KGD z&+So}_u#~IKsn;jS~Z{f>z6xUQ{G=V9E5jIxNuwzk~ zOD^TC0r5WA^mx4kByWL-Bx3*B0ucY6V)0}>@~w}CD#%dk3oV(PNn`(@m?ps*Km7_L zn=^7e(<7-Afq2Sp(xGVz{kWE!$=3vffcdfUOH1 z=49x_g{Ia++@JgpxZG&*q1a&5Lmo6|>-6-=Hzlu4sd}NryV)zJ9jy<;e(RGEoZcU2 zJYPR|Eqc&U`DB#78LIJSdSUEu8aKaC;u%B0SO%$_h$fls*WUYH`=4Irez@xScHNe+ z(^f8p1;RLEi$nb#Bw)8!=8{Y4Cnh_ntn@#Gl%KF#_MxzIVb14h*v}c#$V)tASgtI$ zI2b^V@Ek0TtN7Sd(cb#4i2}p&a^j3rDSG}>(mCe-{Y(nf9FUAW5Kz=dhxXcU-`eH> z^rA&uRAYzB&NI?eaBMOPS@|N6^FrD4a{YKh>|P!1(ylmYIX>js@7^)Jbe7@{$E&Vb z8HiQ59mVU&#gFQM{qWeVv(g@EXv$U5KQMR4vEiookBQib2<=w)k;O6+mX^oOwLjY* zzq;3uNkh%-u$lRnOy{`?aM#E`=<+zeOCD9~YyGD;8iKsXUTa1LDFC)F{f;8APZHH1 zyfS`2yLfC-(p6gAU)%cd5{19O+&?^_^z_*kg}Px-wP1Jd!$QxMyb% ze0Y2%JH&Niq0_yr-7wg)MpL+9@{o=rmzj7)9)@tCyvW>h=KrRj>c7hyT7bQ|58?d)}`xf znkZQcG))IGvywPp%Foj|2`JOA2tXrshpDPE;$n(^zC{03GV_}AU2z0TuQmxH1nsmD zyWz8epX5fR*@fAHDlPc1uH}Wf(WF+5c%SO+r~c|d)hL4kZb#1%lm8v5EV(-#UMKZN z5`j$yq&kY>Vs8*j3L9JLGIeAFI%i}+nWt!FYG0bEks6Zai^6gpUqlvyq>ym%Qqmg{ z-yqXd<)mj|we!rhE)=3!5YYIeR!g%8`cIfR@-{i~i=Px>SlR<=^dn)v+6L8_hd1Xo?bun%SVeHzIOM?=arS}kO|jd8(#j1CXb_E0QW-@EN5c(>t#BYnj+C0QGV~qW~Y%CTTeGBZNTU=AL9kl9?+{E4YaCC zu@Hx&6g7Q3Y13Q0q8~rIO~*b9ua?C#AJL1k2bLVY|6YFbGaECFXJw*iKX?}Xe0o94 zb82 z{Pn-|kSb#y{d)D_`ZsM@t+706<;uYR=dRXsW6hs`{Ol<%Ju7(SNI+N2E>49%&aI*O z?o+LUFOVFyKa&9tAie6Ua{R5`GCoVfGMkAzNSpK)rRply`d*Ldzm1BI1a|UkQ-b!a zzz2ti;iYGqFxdVio=eYV&#pYZR%lz`)L%#S$9|3Yovg}H@yo+af3kLU20?rKTLdzA zd#u0}*`!}v3$rphz^KgwZ4}BZR5k!yU5)vwgVp2~Oj-p)f-0_n9bd3S0UCxl(fI;i z8OkYSy}?3ja}D_$+DX8;?4{7OU$+1S0}VM%Ab%M1ueu;@Te+J?!MhAPzHZV;x-1Xs z&Hy-&g}yj}22!`hEA5N0Ko4r4A2Xp9B;<8p+iC`_$fFfU@rRs8;_B>^*}wIBw3Mk{ z(c8XIgofEBm(XQmLBL63mZOVSsduaDM7MhFI165zrj2(nkOut?+1AO4Yb@|kFUI(b zi;mr4g^yo<813`x^#pgqHLC!WN@=cX~$h zWm%op~4qfT0Pd8{5goFNv0agsWZxHoot3t)+c{tNqGR zvHJUpI?9$jyV?Gi-==4z+t+ZN;v-hE*~|@lD)@(~i;Lh;Z4B>(i_mZWicMMaQJE1g z$yU2f&dNDDEBf+Ouavs5gcr81?sKNgatM(+IvTsT>Gs|*8NR!>^nO}*@9mqY;RCNf z%jPofzteke|J`$wB5P%R;qHf5mv82FdV|OQyn2H9p)B*c*5B!s8{fL1apOhkV$<{| z%ZB+yZ2ZC4|6aEhJU?}0vV!gTInV0jvE#F>fr>cGRj1>xRC_Sq(h*fj@xKqPr&M1=Kn@P5;$VPDb}!7AEFY8dRRD+s^C#;3VZg>>WDO z{Iz}Kz|xYA6hH44owR>r_k}Z`j|S-th!?wN$NH$yi&yqJAW%CDHosYJ-+Vo-{iEQ@ z&eN5zq46+zwc=7a4l?HAhldYdJ}R31@8dp)_w;JDS^2%+FZ(|@DS%G`>|+sc?-PY- z-}1a67k+BpsB-JQ>!n1^4EZ#aF>Fvm>U_kj+-PgALm~6aiKPq^|E7|U}KEP|*$cUm4Bu_!$ zo)!4iCM7L^R$h2Wp>OroShFdcYU_T;m!onEE|$+B0CHS3U%W!4!TAP~krLWtnhumn z8|EDaD8MqoixJ)&h!uVi)*iW$B*&EEkDu29vJ7mnBGZgCP!_edPUxlAeBRL5EzW^b z&&b9rVP)rc0!ZDM#t(1U{wPrw5y5E=^tjSOQWNGiR%!N!4Dv-?#6YzFH6>N>Q=ryG zP;`vPj&m?qe;G3_69W!1Td?!Z<&QU@?%rrh`sJg|XWCrKn`Xk(3_@ei3bmjz6NarE zww?i^X{$2{))A4D{|DTz2wn~e%rOewB$=5L3pwku#Sw8iPdUpd{so`pvfTKAl6^#vt&Q zt8HtjyBYR2DCsGXmk5tAS*6ih|@)g0NrO8j3slu z@>HGVshO$gZ0W;Yz5y*^BN9JB73qJH{zyEeAKJy#)2LV8CGLs~GT#L**2Cu=h&xL*9Lcf+dTec+D{zZB|JKC^d>z_ z8hdvBkMpcbRChmRN4j92d7s&i!r^k~^5c_t?^Y!EX{7Cb`6_QN_3Z(7^B@deA{$?N zwO8-{E4|1(JKXAGb&@oaOYiZf~&)(;B*LTfd&q??s{S}=yS35d(1vQ#(SFXlNTC@tY zTQ%B| z0+1L6)%_uKGIUV)&r#nW!AS;}0=MH%oxf=8s8HhkQff}GCn3_LQ~~~K>eM@?BDwc^ z9mHLNYrd7?e!Hj%+AdK)C*Sy^<`2~zG%-MuTV*JMkO8Tp{s}w;pfjWA@~DdolQK-h zpBYOeZ0h|9}%0uvSSr{@2WCE{2; zmcpjBA~p=bx~0pq;vET>fb&q(K9yg4#+g%Fua(z*i;*7Q(VjZ-g-y@!0@bKJ*8j2| z4Jf@pT!a((*H+F%IIcIQ75rKd^?1k-{koyNv-0oQRdeMSY4G7ysKjs`<6dTCWK{>E z^iTQMeyN)(-|y@=JOZYpO^#ju`Wb-QFI&C&(Y5y_>>;n7IpKN#Qlh23;cO=qt{Y^= zS|6}`*m3yLvC#+ptI+hw{j>?uxJ3qHP|#~VM_)c$@%CPXMM!AB?+UQt@ff!s;swd< z2N4~e>hCVV7HE}31($uVp3D_v{7PFyF%mDOeM&E*c+xwcDqk|e@5(j*d;d$c|M6Hm zUA5U-t7g{&alQNYxL+ae9Ue~=9dqlXucf7zvmZO{yH{Z(k$Ncp#>?J){ElZ+T^ggX z16^IIsP)h;x0$V;#xm4td^A8@_|Dj8PL#W$7#b3=xS<*vo>llO3 
z6_=6mfio^|r@P$aUEyYU&z~#1+7|Z(=$IUhfUuHCZ{J<{lFN*|Ty?I%AZElie|qYS zh|qocj7A7ca4-8iES@{{QX9d`FNud@=nUK#vW-DSI z(6b#L9lKp1p)@9ovxO_1KJ@U4QrO*+2IHg?(WWywOXtgr%AQ}0?^uZ6FALu~E?(+- zcvku;w-0@BD{(;CO-H}bw77gX-nQkX>a(_M`AW z4$m3Om6?1WnA#TgUFX~G`0ucm)wNf|o)eb$!#JMd7WP8c`fJ-6rE|7}`@`ky+lI1# ze2zSK({3XOQ9&S&z0voPxM>>&uz}h?8uQ=GDRT2`yImEMkul63edydfR#^E1%}nz(7^MMcW>5>=61lN< zPt^q+u=Qi@06=x%CJ2OaX^PhiwoLzpl=Mt#8|WbGF$s9rA#TK#)x0NUKsm=WE-om~ zKNEaVO9=|b6(!Kmwm9(_Z(_+-Dn;Lz5T ze053oDSQJWX<@_yP3lX(n5+#X^Uy71d*pXcW5m{BK*2Ur^=$bq_ih5&Fj+bF7-7&e zl{i11<09diPzN>ut+HTItskzT=U^Zd({Zveo|2}6DeMq$L&M=^2jlW+i4(<$`3P(e z#*MQ0ZUXzcF$zJ3bpg2+=)1QCIiq8Z1}R1&rDB%%r>PlK#_qJwUybq0hbWjq3<`Dq zYCMIB9RzVXARk~QROw+=o%!?f%hsh_>ok@Hk#x$i2Mb0RTReKRQB-G}*MaoKe=(Ii zBx+Fw5+MQcc=xbMyMTW;$<_c@Nw+_AgNRg$D6UzdK@C9L5yQRW9nvjsXJrdNG1zoe zeP=?XVr5(+AGq68f(~&8%)Jr0J2@IjO}u8eez^D-Bx(*klHaGP%f_FH*mYQ13C~IQ zLIO>ya%7Mf$2gfm=z-}Q1R{>O@Zb*En9!FU$3%>x1 z=7!QUQKL@9FI*^C<1s?;vfTg^$|Qdf5&a=WSfeqlS7Lo32xsvt^Tcj!udbj)lw1HV386FqPE4KFRS&v5xyi}go?)2>2JS|T(8Ud5x=CtP zz6qgd`K^zu_WPiWzNUx+E^*-g-07Td6l?qe%cO#5&%O#$4zU2)m$u$)+U&S1OOVV3 z;YABR#Pdk;?P2mrjY%$1zn#nrC2=M0K^$H#@z8Ewar-eWqZ^qq^Eq&i)Kg7AGL9SPR39fSzjYpwi)62=OR+iKWgF8KDNRb zG8n2>APl&cU!kTvK)Yg~=W!|cdfscDEW)#;Y1on^TyEF%B>C_r=@8EIicGY{ilgZL zN|V`31p{cNX-Z@f+clsiB4eD}t)ZYHd%ch=W4Oe?ixrI1=h5);(Dx<)?a#rWa7Zwx zF4DkJBPW_ImT=C{t1pKNQH37By1+}ID~NoGN@F%C!Rw26ga3i-d!r<>2GG0zf=+@j zB;G&i6>f}(?fYvteERWOKV?Sl!m(Rd?{c7va@@BR+<0G&YM!PJLV7>3`s8_!3IDFw zPzpCzGdOL~m_tlhnJk|E{;{V-PCv%#%7X*XT?alTh}gL7FZInGzVfJ;@ov2JFHhc5 zJTKv9o(K5|*Q9~+i<3^{(z4#1WQv<4uRS*K@teXQis&7aUsznO0+sRTqr}AQ9stFg znpkN*EV%@))#^vU;6NV+t5}0zaWc_sTN!2P-PTQW%TxdW0X#)TJO47Nu?{R5OiYQX zut(D3MYTn!Sri%O7$qv>D2*c+5Q(iohg{3b#7^=$pC__7fP%!MH6v3Ro3Q^0^&2AX z%k%*J6wJzN+{IL)k%&;v0_%q$!4#N8h*|kJkQ8+FTK7{osDh&L)MR3rQe?%SjSMmA z=hWGKSFWnR)h++?yZXbh;bB)sE#bi5s@wf_9lFvelQSn@6=+>idhsEb`kq}Cz4spX zngki^wu)|!=oJwe<(h_)2&=sDTZ|?-@h|>m=asH{|Mm@@o6tFu#0H>c-kZBWAo}|R z30EG|_P!j*{1|KfIWMqA&H71b0{mjIfy}IEY@nyG*7Nyc)$-N z>C5`(&6IA7aO$*PYVo^oc_ppIx`w+R>7_joYuOW34hoJ~j~uyJW3v=Esu3R*e*E(N zeP=7{S}j6Xw6^>cmT#>)S(9IJ1m1Xxzp0rJP7t%^ zt@{O%XY0>@b58y0%Twl2kM?@)9V+LxZtIpN$rLET$jGW&8I)kL;zjn+V^smV?6U#fG} zUTlY{Jgh6W)jaRd8TQV74r!rh9kM**N-%lc_fNYL3PY)iegJA^$2T10+{-DBDq;{M zeNU=TIpl5D>AB-UBU|YOQ-6OL8PCZ;1&tD&(}w2>{0cz1^`6wP`|?iz(vE+E{cf$y zslRs>#MC#dzTRyN#ncWW#@B>mK%`-Xk3yi9lZ#z?yEnLwlH+1K~o393vXH{ z1aWW;)@{F%H56Kx*g7(TXgq?JkgiaYQEt=?)`0EKGYJ?=vF$I~0oU+tpyz#@$&d4N zCq-J5$iE2yZ?g*tVv3vnW|G0}Hd3o<+1AD83{+4CJ)zMZe9_w=bd(t^PHbPg=1i4b z_As2z4aB=a7eEP~wF+xkCA@=UJD%af5|0aA(=5eEK!@c43Y2#m>kj=C!NE^hn1%+A z-j#N5U{DT$ls%W#j@*WTq{{s{+WArfw@^q<468{jUt%ReJ3l#iSYHJGP(Uh=-PWqp zx;!}elwVm~nf&ipRv(iTntE7LNlLiTa^OIhXEfP?vCqcglGRzR-{$t_0`stlgh+oF z%<&Y~YJ=^IoNIG6`1wjv(P;x*upXg5VvPuu1;iB~c}5@6;dK4MZ8%Jrki3;KMR3gN zy%tIDhnNUcorM?kdkv93<80&ko1==QXXiz63^n9Zb>rvHv$

%#q+cm(D{E0CANy z%`)TJ<81rJ(DkKU!m6ma02&-)Aq!*Dr4)aoneD?KUX%8n@`fafBLHhM$QCXRGtE@> zD~n$U^uUIz(tA7bik0s=GqG@NM7|QfaYhx9&$i!O%3)*E^}7i`N`Lw|^Kj>$Ty@e^ z=DWIRhKnSalcKPG;VogzkW{kB8}D*<5^Yuh=W&wW09XcQ^nXBs zAE=C|Qh-7R>l3j3g=CYdaWsH~qGdflRCl595lMVUJkXsvcX=58LC_4g%)j3Wz-CzL zZ_73#S0BcQ=wcG<^Zyz88Z$JyDIaY=3#AA8k`l!6Q}ea<0ASDm7<2=G2KO#mT^0)` zv3OpL0|A9xY9vR@@tpvpuqhf?eSv6POmW;JXU6DwuFyY0mL6|aE_86|#AjJBoz33h zIkWL(YTNAf)?ex^InHsCF?sG>fwbUc##0eMO`qZdhSEJiPz*Q8hl5y>(j@kjw zK!MN(LGa;AKFIXNMn0A+ZffN$5ZovKP1Q4O`JgsU>L-$Q1p%C`Ste^s*r|6piI3LT z;7_%ZN>2&Vy$+1lK3x0V?x5<36)o{zFHEy%g)G5)J2vcy>8PoK+=O8PKUi^Wg>3FR z#wz!E0Y!(7ro*Vz6>REElW5JCzS_|;lv(|&)_UP@)ZS49*8w_Uvw<#+pr@}ebCcv% z%O;HGyfX(DM$b8_)K`nj@KI2?6yBlqMHw~V?S~iH9_HkUQ;*yL82L?+N~vRFW(G6Z zm}JH~sf)ST`So1^ehw2ppXKcQY6>B*pU&F;j5!umIUIJv==C5~GMT%7^-iv{-JQ48 zo3_4l*G=)IXKdMR{kt;ut9QeXe_{-g#8R}J@-s>Jlo-F^_zr#pu}TXhP+e4u#sitW zdN!P?>?a#pO^6S|MNkohao$4u!?oKtTd@D}vWV3nL;}4g7!AT8EJfmqIW8(iNXCJ# zG?+I#Maspch>|15aYX<1J!{4(luG6VRDZnyd@~*hpmdTfd4* z>j6a0;yyu)(3IB48QFLoN|o@a+oOgr`%F!i(SVBzFfA-d)uLALxjhR5N00mV{zys} zPFz))to!tJ^;LMEF66?Wn$fWPe^?ry?>tWZiJvgRm}gElr3s=IZbVpVKD1%IF8q(l zg({yr{L=a>#^w^|&Nm&M)NB>A_N|I?%=IVDw=bUirxM>h^9s~gFnr#3UOI@F8@WGO z20u}gR&*n`Lds7LAI3T{GkY*MT#+eSn)1oB>~`G`p;gQ-?+b56Et+^Q^rR*Nlv;zB zl1m_cxEsPxGhddp|JH4#dVa*=^kJ7PacFFw6_=ot)#+$@rdjQDlj?9h+y=9l`&Ajch6;lEqvYMays z4tHMkZ$C{EYZ5Jn7&Jl#8;jrJk(GG6Hc!qe9*LTq(@pg^*Se4;g-?*H+`Qvs@ov}< zx@<)y27WF zYK8JdfTdmxukwn@$->C&oZ>{azVs3ZQ^HbbD%#izyI~te}MC`j?_6RU`r4GP(VRc@`0n_-~Om4(Lzwf-hb4N_m_# zSr3%cRhV^Wn?a`g&qf>qUfL5kxn zlZ@hJC8gD<^;pp2v1)4B9$vx}FKOolPjL)#H`7_M0^FYMn3;mH6&ekdWOC1uE^`Qz zQ>_5?V7@X!Zp5gOQ=)DmRTnXSPG4WpI#&le__&Lg2Ti>MmI*{F5AJDga;%7}f6ru| zfZF8@JsLIuUG-M7fZ8vkuyaXwZs=*`c_2L(@n1GBw5^aq<>w3CO0yo}Nv zUX55<95Ms>lc(x%z%Myj7%$dUM^y?gt&HCSDov7W^SB=HRAPGLYXmdYAEipqH(OHR^O`o{Qu!{>0_(9|_!Z8$R4yw2rDt}fH(D^{%C%Mpx55LQy7aF@jq0vc{+ z5JnBpbrgay$N&!?D?CX_KTEFqVb+x8d(AQc6~kX*ic-Dr--?)+9H!Xs2wPb~s3QK5 z2bPXfGm_FUe(#T{P4MC%37`JbA&%J1-3lbbzHi5$;Dxd>=e^Idt(%%(*)OKnUe&KZ z(xE%-(NkBd5;)a*tPtO1K|dZ&D_7}!43#uZGff<>P4y8awY|_%Wn_-H6_p%^ar~xX z?+~8GS=iA9NkYZJ9Cy64@TlwD{gQ+fu1gG9&xPiHOiaR@ZG{JRMEu=;M#pB67)`qj z#w^wg;rCxBXUQ8JlKMha&Etm9VG*^0m{|kkj{$?D!rPDhATXK7_cPvnFKr-~Mt=)h zz;f`!{|?5%27cd``*gUJm1VdR>-g+w_QBy(ZCZ3=Y!l``T`Wd}t*fQo%O`a~z*u2| z0JjGMgHjm>ETVrOB^_^DbQ^1+2l$Z;tYY>*o<5sth`z7Fi*eA`ZmgPosrAV=<%C+kkpw>)` z>a}CL_GUs7g_r~pCCS}&rx*IJGuSXM^Q2$FG%N%=49*yY>0{M;j(8WU-3S;mc+vGl zQk-H@KR?UNya}d2mh)-~A`&c6$bdh89VFemmk#Vm0~XVEg2)4l0GuuBD@*NnPY7&& z`&a$3uRd{NI(@HvWv&T4o>?w$peI%|R&~TpxgVYm$XFv^d?JK44n5<%_9N`JA3hCO z7a9jZ$+9wCi3YTH%Cj^(ytU>(&oS94d|FS|1er#*UO(REjx+=i+>9_ zecE}v&6EEh!33) zp1!0sq<@)Y-9$39d;bSSH_`{9leX^Ie=ds*d60iuwmpIqaiVwF&bxzES5W)=`93J6 z`8}=a3t?II`JP`TG;9n89mh|dDQ6GExtGrAPc6~J1A3_-7=23?XhDZY7(6IE_3&it z6IbO+NE+tf;)&94lqYWPV1MY@uIo8syQBZejPoXWnO&6$=@#xWXdvq`~+_e8}UXSND=lkDJgk?Xx87#|4!{|jlIsWZ? 
z)Q2ylb)prszpvh8Jb3o#-5fRtW$cnrV4NK1*>4L7?{tkVc@nyPuD|ke?0QzV>#MZC z@+qAow?ocK`9XUDuBF_vQ5xt)#41@%e`0{>tL^kkt#~h=j@|X~^kX&EoDXN>Uhb80 z3ad!3*nS@Rz;Exq8x`h3zNKuk=4GX;QY%L~DqjsF?mQ@Z;J_pMN{P=)?B4voP#yI5 z{ki*wf96+T!Q4Y9U;8~51u-XW$B*B)Cq1!(r~lcyV@N(?h@DflK37&3+G}Fgsaf&c z^yn_|1##E4dyAsK@?1`JdgVMEj&g2QlOJu*av4K#Qq7wP*FS`yd6g*5&bY z{M@6jzn$p1a-mgP{ZYf9T95CQ16N!F_J)Sv3@{nF^hHD7v+JYof3h&`lo7Z1X%60c z5_|2s<0E0a6VcquaaXrta#G8a(7`XA0CrGZh_>@7O)@6$ws2wq04`D=aaWWoF*Vqe z;gvaO09H&0(KT-Top);0I|>yzAf;901H}@*99cB^2-<$)rC7>pjBO<%z@3q>p#q>ByuEFL zEkM%EUzpB~$1^~L39NA(liGV8_-#Og@kO_4wAo+qov#Npd!#aposeLPP3x#fe{bk3-=;GU}4;s{V;nbf3s=rk#E z+-Dt6!AH1<|7Adjpy3n1Qr}>3|JlE^%)mYUkc=c_aZD=0ZXtCN#}|@&R!oH{p`$lW zV^db4&IF&Cgmpr{J95v;*eFsOZ#;}3qGGw!cnVD5#s&MCm!qP4g%D!(idszpkd)w= zd{z{lJa~O^SoDDkT=p9CWvk@yS?k#A1oz}q3Wq2uL!9*ocuufqFlJrzT;M-5O7bW+ zJv}mGi4HTYotpo90RGn^qhJvm#>-a=PF}24H&2pTnn&Jn=jzU~2~sPvkzBT!l}x>S z^?{3->A^y6QSC*)q$X}L2h>9fu`g^R~zB`k56&kkjKi~xoNQ(Tu< zN&*XmO6z`H@}W`R>KhhleSZk>KJ2NN0=H;4v~EbV-xPkCA^P*y5=N*_FA z)JF#rR1L)^b5dzs>v`0GWVER;0WK~@=yw@cj?CvF`|^;xQ&>y~-5B!_UO*hdtA7b8 zr8tl)9b4srD3#WSXk(K5CoCv=7Lr&E5|v()MQp)`oKpCZU+X|K1bg?NH-jpwR49@q zlCY_h?o6%?Zq+hwI|*wMxFJ>&QV>^xB*{F>%<}z6tSs%KF9?C+=RFzS5y7(NsP&LLM*;-K zLpt5sCr@?^?*T@LX0Ltf`0diRyY|#g&tIzfxHHU8%BF&ACKy14O)-+s{1?V2+3bH7 zM5zT^yk0)EfN1?NZ<4H)3$S*4PKo2FQw|GIoZ32~&tdu6ib`X+-6+|>3g1?;-vTPu z0Srboese`TUCx~6^ZvV<9F|3hKBi=bx}+Xb`V{O=MGq5pdOb<^w4d}I-GA$O%f88R z#+T@CZ{y^pQe2N6szaD1u@wI}-Fj*B18LuaIlCil-zoF=L+qz}KLrmzzg%`B9{am3 z{=I}>FQZ$qDme|QFt?H7Su$gn}b*~+c7b$@1 zF3GAMlU*~jh0v^b4beN-pI#=j=V~wVBo=Ho+}*_ z>LfJ}DV3C_)k7>rg-`5(64X#$&-u3}uyII_rd=0I5Q~E5v|~}$_hvN(>Byjfu{!&& zUsRw&AFZxf3o51>bJRh@!r-PBpH{1*8^(UK-W!hv|CU5|vEKiB$Ffi%CoDhI)~(E4 z54lkozX!r5#hx+M_MLRN#jCBwW6rK?llw(w$niM6H+~m&eHD3Qv#$(~kcpQS+jbm( ze8N_|7J0& z1VpW?tBrGj{Lv?$&kSqtd&Qi2_f1z)z7eC9ZSRo#529IgsVCq=*?ohmS5*&dx(baE zANyo?Zs~uyLfr2zff^f{uVVVCoD>Ke-2Wqfxa1Jno4z>lkXgD`WU^uR#FrRUI@mO= z+zsh&8``la;>X*IZW6ST9jDFRlp)FVTIcY>FP&W%Ki${k@ar=Mr%lsV<0>%GUHXOQ z7ZdjpO^+A_C<1Yz$e5U9bJyB;`LS9bUYj`^lV)&WD*b z5^~BRrzwZ6h8&Yr&WD^&A*V5l$YD}*7IKP0&ZLp^hn&mV93mw)&BtUVHK?cjUFL3S#tFypX;id z%tL++5Bd5OO|WwR<0*%M&X`n{7Y0AKD6#Qz1RRyO3%eb>z47l{mH0b=eyOzdTaP~3 z!{~mf#E@^6_WL`(1;_R<^ZkF>%SXQ7DJy0#K=uOU!cQV!_i9zXAL0f!tHNsaV0L?s z=-z4F+o$wpmFQ2%Ix4!*0}cufAU^5kJnHht?d2XHgj1)PbGy& z(ole2_fr%~hoeK}r&Pcn&wt{1sNODJB&b75Vg={$WeFXL(AK-7sGRu$3wBxu^k~AS z29gk3) z8zv9W|4RVF_Gv8+G)LgA8Vf}%^XiAXY831B_!v82T0qw46LD_+yv1)Qem$GZSO z$*uzdYxueE0iZ>TEJkR5jN#%FJ0Hm4Cp*JYl{-itMJ@yvA!jPUBN5(6JzA#~3v&0ruRFAY_UkOr_%* z_TL3RUIH1eFw6q@j36fI{@S~)BK$!3RVZLDq3snj^?gO#8jr!c^5Nj5?;M1r;A>7s z*}Q4IzX^ScLfLj=6#z6aEUoXWRGL8alGC;>ARTi6-n~YUTxzN>VFRXolxXJ;tYoPS zfSO^Aoq%>aEp7@6MhwE=(<8k6dgBvfUm}PK?z!Hdtjhy1jUiIyOJxd3=JVQwIu_NrtaRL3r=bXzff}{1(y*l zK?<%ouGK4Aq1(*xR+zeCKub26)GhEx>i3Fnn*XPC!VA=*9x! 
zb^%x=MM_#Qu1i;ubt_@W7wQ62$LrBQL`yw-E0OM)N_#|0@(WTbC#dv_Oz5|gL{GM> z&hIaCaBPQFv5CJdK(G_~6Qu=110E+ssxJH0Q9>CqwRa_$P;yDKn#F777y`~GptDU+ zFE|xQxAdSxkeVQim8l9@w|S3ldVTBbz%0^?Z}^0=`-v~}t@TqSW-2A~rn~U~*O-l6 zvH{yuCK%GSr3N7())KdSv2jhlL~0OA!*wz8&eV~4`*;EW25_muk^_n*ywAIPCDt;O zdb`gR(Dsf?0r~QF{*UMf3hABd8hH0!H=g1yo9oVb*q8LrL}CWRZ{I2|9{i_`<7Cl z12(#^c|!?_4{9lv1>>9dFTW`@uj>NS%s*V=yjt5jrJqPVMVgN&$K2iBXzXHnfA;!s zD=(^B!&Ai>xqxUzS+<_k7h>Svt+$h}N?u<3+JHj4PCDLr(-Q=kUt}NT25;6aEMKaa zRK!7&?(P=bVYk%c2e0rcQ|5xDIu79=A3xqW=2RWv^;VNmO#pG^kaXKl-XrQ)nNj}% zGYNN;$d96HGbSN6n%q&BOm10~pQ}rdM8l6+6auaj8BSf|g@C@y?&=EP#Q%UZqA$xr z9_n(60c6QX@DUumFPjZ z#MWli&Sj&22CTemXQiLIODtc`P}om=7zXKAX#oGaSH z!SX@ss9gVQEdH^2figKjWbBK?L-q}~po?C3@WK=@mMBS&{O{GqerCxyCj-{9t+sv3 z=HX>ofq8>;n>`(5^YQXP@6Hef3ujK;YMTYV=FKEmj{N8lNy6hd=|TcMTvq1wjZ}v) z@qfx1@jH^znGLs0TN84!&q|P0VZ$BHV-w!!XnU$o#Yo{c1KaC=1$5OtJ9_tgp`Bse zBP0O}jsgxiZf8P)jNre2w?tErk>MzkNG G2`0%VDM!=8^}+02 zHDUNEvY|D{-wTDaGTA}XvImDF7cI2}yL;99|8}Z)si=YH28Vhl`I+a<0Qt}xW`WOR z0>EwOXIOdpw+-F3N3c9B-BekedLGLdnQ$2C>sWK`HiE=GByTE2SZVHM8hHV;O7ht} z%dWptUW_Uuj9QNX&Y#F$cCb+p?{?y1lqdjd3-`p@yanS zZN{PNf$=(#vfbC$0_6S}rD3n!f5LcH9S+gm*(RVm(7U)k7&_Hh(`0@uW>&~$+3*{! zUneog#Y|Sw@Ut9QFF|3J5{pHI^(p;9lJ5RVx+Mj2Orzm1^(vY$S(XY3b$VEC(}|~Ib2~^@U$36vQbxo$QKYNJy8I4jEh%QZ)4JM>^pT;;Ek+_ zd2p=}HYWk3y5s?Y9~x5sRtHOY)p*?B*w3YliwxbvPa`%my9>^#&Z5t$t#o#Vrw+Z- zW=60R`e@1hxO5fyu&-FowlL4GKg0fULx4QEl4Gpzbwf!Seu*pT;#F^X-GuC6=3*_; zD8Zr*ph}L|3-Rj$=7dUXC#C&u0rD*IH{I&g$Q*{kQs=8tyx6WHN7Zz>}jO+(LEPGUl*ku{H z-19RYNR4-);&@w!u;9dUEUn{*PaGUT>#jW~Rp=3|=;?@dHfd5O^f%v-*TwiPiBV|+ zpICp0pJB5*VA|VuOhQPoY4g=Z9bCWiYe@fbO`mLxmQiJ9hLNhb8>+kinFb{q zO|nCHlXd;%_^NqRVg2Vkk6Y?qrzAhq_=n?{IIxK0`NgaxWIU+J|1f$h#KR}RQxs9H zTaECEG*74esVFTTdwoBnqC<%nP+_x>agEGfi2I|J?BB=_4@7Ne_n_{(|8g1Q>=LrK z_yDP#*Sh(+VEkmLrrq>XSGWX{xiyT*yliAnTak?@W?X<=DskxTg-=!ffizSk^GkGnXamocwS|Y&?}Ng_!-)P z6=hXrGcY|X%-IhCC&1v-#NwC$3?67Z_QX?Z1>sm5PObArzTlm?ZItC^9 zAl)9AWLIbQ;o7(p^uI7*vW3Z}aTAlDXuogXR=z1`wsD4KgDhEXtlYl#(k^h=XP8YE z_rWo0B4~|sed8d*>-ag6X(3lg9?{XyT^zTZm20D`z$7h^NISF^vWumBB?oLzDS8d zmb-C3vNG;Jvvr?jdD{U?F`+5pgL%cB-0$<~s_-srGf{`*MRE`!gp7muxpTNq z*#++`Z8r;?|$$Q09mbHJR0`SK!=3Yo9y19Kj6~dmb^@AKr z3AVxT;DGi21L(Yc!q+CB0NTFTaihKcOMr0R4IZlyPzVLhWqB=QxqDPap||i!LqQr+ zweKWD@Hkm=Gu7rng_n9xhPkONkHfz^ldcmr*?raV$EQr=3CQH7q?8azl)W_QpWmKU z*@W8sApNTg==x)U?GKBqmF5*TOHdA@!_(1%xtG>_Fk`i^cS7T3LLF)l=ee(nQjuvUsL z_5Fa};O+J&B^o$N&Dv7Aeq<*T_AOnP_CMK4{nyY0&HSJu*;l@LDf8|$HAFLVb5goB zo}!Jeyc|T(Ia^Bkdw5~uHFOpm4~9ex6nuu@p^S9OQE4`hh}HDo31#?L9fo z2TR~9RFNEJiK_`V#*4XR{OZ2=M5B*~3ax4gEFmMMt&;#iK-sIQsiKj5M`4;>g|S2lnX%w>bPXZ^f-l%k_1%R zmL#z3y7gmBqA4w&cHWzc)!TzETz!VDB%tRwB-28gRY7Ko@K16QqVNWaUtU8Z;Nm5d38eL>aI`EdAd-6KuY_2V_j*cx9Ke5cO`&_e#&|inmLEbo9+K=!zI`->Pw6J|y};A1M(%1u+bmzM{9TxhH+I z9ny4UxLJhr5;N7wq7{MzinnFqs|k|HE5B-8>KSX()F;>d)>+Po6o7?M2SMisiNhf` zYKk>fvvU_KXh_pmVl}->RAISztZQdgG6R)$J$f4{HZQtH$+6B!-`|S+j5(s4x!*E) zbuXdD5keEwml|v1N9d}}it1#&xJGydh^NBS^Ofyb=T&$qN=L!B=HASZLafMM)qTGl z_jj)qMT*inXdmWI&t#AT5_pq%94sFu^JnV;oWz@ci(}RQyO5OWxubzNE1pqTci4$FP}xo_V4mK)gmM3 zXBCFA%Mp1^+Qlgq4o=oS$v8PTz?p4*8#?+)F_&qvscvR)>j?(%B`cuz+3suT+pFa1 zPgwT&Sd6LHWm`6{>YZA0UW)iQZy~=FV0!kfc#NWGBZ zK^edDnXnjnj!3uJ*RtpZ?#JIU=xcs;(vRwl7=vBCi%%|w+Yq@B*uj)}r>EXa;VuIb zuQ3cNmthJub44fQ^Mf}|sRXn&N5o-?U;~pAW-?J}T5+P^sezQwoO$V0E5T$PMogLr zMj0&b_%c3cQdH_sCMJ#(6Dh>u*x`{kVTYa84nsSCx6A~WM_7m63S$ln@(geM3=r-7 za#=qKpm+RG`Z(7Yqy!>ZLZy=jvDMR!Q6yy)y4BR!2>_7P;M>9<&|5c79s<(crb# ziZJ(bn;2x`o2Tbv+d+P6_{19-uE#^~x=&7dIC|YgihX5;%7rg0tWVn(q470f>RGCL zer(LuS|IJn`Knb)6A$Z@uioN#T<%ka57XX!;Iu#nS&@T_d#${^lCzjGmniiiA%=S- 
zL&MxK!00hwv&!`ll%(np?H})QD-k$5KQzDThub_%*}Hd(+*nq@d~vGoT1bCkI-&d( z_Dw%nHeGcwN=l8o_vMe}!t$Z4N{-MSz*@On?)1*-nM+(t>M%0K{CV0A4^Amz-&C-b z6oZY|8o-A+-2*Gk?wdB(a@II9Eo!$Z^ODQVJ{eWct}RL$NK2iRda3n}EWa@A@4r5} zN?U)dL2x_*zP9~@YgYm)3&_ulDzuY3SJ0!scO3C-;lRCtpdgWd#`2bIHCdN0oBON) z8t@J0y|FUNVqW`>QW!4ztN;3mc}Ma=lLgH#h9kxWKS0&%YvVw`QUZnBWLhl&A#*|*2Qnd&R0l6>AtKP2HB1L z82Z%VYvrdM3?X3DJg?ieI$f_0gn`k?h+u489w0%pAS^y_JnUZ2ZV|UaHpy@QS&Do2 z6#|GZ@Kf}@&BGX5NG434kd$GS-9XDp-UI(F@`^Peg*@Y(nYrKHX)*@6<38%A;1G_- zrldCT%B6i^)%lMH%CXx}23Q08vZf_siy3(bwD<9UYs15_QxDcBwMWEGVByQ~g1SwE`y-OuBg*k{F=73N!U-*_bIM(T|>2dtbO^>2@)GA&x1R(k%4 z*{i(jE*`4Y9;oQO8@dkKU>UcqQL{KV`kmrk#UPf(hpnP$EE#SFK* z9;21rH?d$yI?bwl5*XY3nGwST3iPB4=V1)a#1yzO8ROhPckyn2k9?L>C4OyfUR;L$ zm;=cRTe zk7qoY1Go+~%UyV1_^%8t6wOqHjCBJ@3*jl3kGK7v8Fn9UcVFx8L=FmZyJm z8Ur#LmpxB;Ztyef)R`@E#?JP(&Y=j!&j-oHlwd7gQ!9wu=|cw1e_Tz-FWvzbp_ zKqn7_ba^+lHZc_AOS65Hp50IKZPkm9Kf2(l2p_Z)Ju3)h@Iw-h4lg9J6yny~K}=J3 zRVWk(Q@L^wGp3{&hecq8lG&2W_}2H$J1e9g;aC!|sNbYce4QA;oZ5+5gSRcN#g+WQ ztk=2|^pYaK*^#{fLZK2ED5*2KsE^{K7O(dst%H1zz?)c-4dA@4LUMg+U~{={eAwTqp{X@M>6>7)L5FuHfMa5+FalhIPCX1Abb zY6+R*lF*hmXk2;M?=GCFqm#u}<$I!nb(#T5o~d{VnM+C(D!1-Q^Alz-zz91q`v*eG zW4%~PbbGZ`rTfpRe8et-V61Zn1x zwz-$%S-t-QEL89baaOAD=_ubJnFbq2=H;r4x$&^}YuXCeY#pCZ`NtmDwv147@ z=BI2wHL_gcRTl-h+AvdBaIF94dHMp8gzZWPZ&q!(rADH|66)e?Sh4hNeoKMr{zmnJ zkMR&jLD-xp2{l#f>!&cYshWlsv$o`s9UV;XFlwfQNZ?ciLihQpz37yRBH^nxYZsLy zoPWmCWfvAsj-`GBwmV&Tc>i{`2Y+IQ^6FQWtaO_lblS9++t(pZfgDk*>&}g`d)0A8 zMSdRSd4IUDjMAEF=Oy_eJ`7f2xNtJFW;rvh zx72BM+!gis$EH&dFG;Xw<+7Q@q>{1Dnbvjb@=fti(Pl<(NVEO=X4&6w&eo~Z@c`?x z#j$!L(W9ic6_wpMUpb}Qmt@dgYO5J9uim~^puE}g30N&eS=7FeCsWo5E`R>Q#{1%0 z73@)cH8wF51*5+zocY|IzQ4`baC|ccwbJ-~u6U3f_F(t+L$9krf$xV5avwwD(T8$j z74F~NJb0T3&D{LKqpq9HX|KP>ySEwI9~?=q+-st%!jdg7XV@C74E3G6O5CxyUKBs{ zJTHf@-qXC|?vdXz!zS`1qXn)!L)OG`9U6lUVGz@-B@p`2-+Wr>U%%&1Cr2~ z)4Nu5Xh&>nNB?PgrVRbr#oy`tj_%<}N_2!@qW2|$n_{_gi-v1mGFje537O1~lp7p? 
zMl1&Jh~|J@yJw$Lmp+}ew=Fbyfc$z~{6)P`pFHd=;z}N$ z32|#l*KhKsETONp>Rs*l=iucj!}y*UKKWAtxzZtE{HO{VBcn8b$5B_v3mHCdTDYXc zWf&fEAt^)$-W$gL3S`bKZLUtYOT)G%223l*0JqGkloY(7~oEm=CiJ*1xfyuA(icAAB2P6Esf$!LrNQNc#O)j4`6RKi?z$~ z{F*i@3iQZ{Pl$u{^acls%z1K1Xpu-fAbv`VR7)(y&Zf=%V`6T2K}!BSh7f9~Ci^K5 ze!LHmhVaI?EM5|dMuO;bN#%l(yu;1{NKNG6+be3LJ*2K+_1{GDW5-(ku1sxw461%e z4(hEWey=Oy_ILw9>)*AXxWPZd$c)>@0A{^qw6j+XzVQn?N4D-Gx|r{yGGO&ZOjjJ5 zXKF2UWy`_`0)`rfm};1{#(OwmG`RbjXxc8EtXM@FBVI|UIWu2b)}1LXs!6hq+9Mzc z$bq=twr)01=bSnKz5>!d`FD+vrb1t`I2YVPR5ZyyuNZ8=dSyvcb$j5l&K5K$fUR8F z4WO&xH^&@uH`r+NC&5p3f<$u*fOB?Bx2h?WrBLh7>8~Dswx8kBAmpC9eub4EKmXk6)8KX#zM%_p>v25C%)!ksjeX)90RScgsl45i^dE zG0;MAKVJUTD|SvI%Vv?4VXqHK_XYm+@Gt(f-U5)w{|bn5Y6RO6MB?L_o(pdjDsqYQ zqoKD+kglu}CL}FS>&#OuJaA|YaH>7JUV4#hn>O3%J*Ru;_Y$ zz`3SJRmc`3q+yd5B6y%enBMwWEc5>KGco5*VtJ<)#3b_m}P#Gwm2j&qc6RtGn+QMe1$MGj+lE9(9$QqqV ze(8s-Nq3nYUM-?wp|jPZZRNsOt1%pz0>|hu=7l=YU8Ou#w~l^r4nIQ4g904;u(-0d zG-$w3Mtg3-*^(;rcg7!g5dLA(ht~x~P_N}DLT8C3s%q=&8NT0zN?n`B>Z+r<{|DeJ z`#13X^<8P+`l@N860j~5Woj2za;}5{XH)hF>@OC%`|<_S?g!WR%xu{~&v>A8U3hkA7q?p|qnfjOcBaTT@3ZQS=N$^Ly$eL^sf5Z@+)jrcKUAMw|0ezs z#l4-)Bv9E92LJKoDf*zFZ`bCG`=jyZeiey*i*HX0@CkjuyCuf6y{a&g{{buoEc?6) z+TqzY92Wv_VTcqy9_$0l+d}QgY`~i_caC&BSWabao1ENO#mESDW=#9!tgN#gy21N< zukiW1yB9;NBv9o8mPc&MZBtZ3ET&uLC;0JvB~eB5s z=fwW2Sjb9uv2l=SN*S=|suTQd{$Qu>1rgVIVt?D~eweOzfULO7ed7NBejhj)RHr=o zm|U|yF7Yu>18wFO?>&?;zELmx_+ygO?AmLqZ&jRO8|v@q+2I+``yPs~2{{zml(+pJ z8}(;xp&J-~5_li)U^U*Ja5&9U0%1KySA6!U^B$c{UDoD90fcvQGQl7Y&{P*WpQsEzq$Mr+#YypUv<+nSI#otA) zE{;ad)ajz@P)!jT>4xQsWoLp+iY(X>e^Ols6T^>8g7rTN=8cWk8A3?Nzrj}4yK+$+ zwlKeMmg|-y^|N;hgu-feHGEB`jk&MC-LA7LT((2)a82#q+WdJtsipp2GhH~R|1VqX zcYPze0C+HB;LF7R{YTjq?!T2^-vxtTE>MmuZ6+?5VXca9r|p2wX>nGWmf2p<Fd5+)fg(Ct5D{sL#fjd*BZV6IQ~i1V0Cr;Oa3@*vnMlfI0pQ53rWbJ#?@+7KmJbJF9 zvCI&Ff{pVL3k3ijzY?zRu<}7{m;eYk=S@(5uo$N7%eh4)gsPAjdSO!6t0<+@(q`!S z{G$BZGh^d`4TO*uS=8f7A}L*6pk~iPD&>({8Wejvf3HyZ8*O|27u>1xX4yu0mP1ie zX0=Lf{^9)~o7+vqW(tqi8PBqTXR!&s$Qd1{pF%*50+z=b_t-2q1TQ5TbyTk!#_+;@ zyZingLw;}8Z@r5aA@}||y;1Ee2V_M44-jkajyQtkEdm9LP5N+g?804Gx2GtYaFl^rJGb z>;U(aigp*K06F|wqzA@J9{>wkW77d90>{!FeD3Pf&?WjD#&X{DJKD)dT|dgP-mTT_4{#wyR1szulRKT@Kmj0ikgHf zGxX#p>Bjq^UjWLX$A$kegf4K`6&Z#S|F+cYw&Jt#jA8>kuWEZ_p8E_|K%MbbzWJBk z%(4Rj!mYbmQd%*?ik{L4o~OJ)NBS zj3AL{xWxz7l@kEY9UlEx#1r$|5dIv-mGHRZ47%iz;bR|Vv97unTW=C6kt_UpSUo!C z`n#Dw_<=VU5gW-}5lmt=bW*xw1g%@Yt5&n}_kus`qkM$X`lOTl=7~H|6vmwGx{bYV zp$#T?%V%k0)c>x&bi;Wm(NM>dP+v@dFczFE+1lIQFqtCH4LuLtt5ymuDBimr&H z_KW~k(fEI5N|43T~#FnF`S%!sQ%5$H>!(i;c0i$(v3$Y&VecHNvHCG_`F${1GOCFd!ZL z5iJZ%A-y)4s2*3Tt~<@JddF;3qtm`x$BSKo1InbmK|~tkU-`4PZX|ImXkI7zaf>+&mRu-d=&$H67CaX2JEFRl% ztskI@ql*B)t^~IK2Jc1p-}fr00o*wr0d3 zIp}eT!Ew zrd|{lin#Gyt?|?1^tB#hFE5B0#{>Rq__)rbS|zMD=vg10xAAA8YjQ~4hf=X*j^(~s z&Mx5?1IWFtO)F)J9~y{uoCI6z8$9i%&ygIz9-Y5RaL3`Hs|zUj`94pw*PvbaL^2iM z`-yw}SId*k80Q7FFly_9@x z761w;072Nl0_?7A{CP7Dms=#~$1hyPdG}RS>4_}vXQ{Crm>NTSkbb>(D6B7_|Cn{# z(nC$4iGT2TUWCf4JW*y0+=91^A0&;rzuznT?Dcf=j*{rz*;$3fWKD7N?=AF?Th{hf zUgQqVQrt#h=jKeQWu@I_ za-dx@7lAB+zy~%Pi2oU*gR~N=DB5$;Fiu1 zI2-Ld?>5JhQaa3TknR2x1W6>Sn(oWAp4zjJ!Rp%Ysg8j^Bo?SXZr<8oRohGT`X2Qs zNNQIIwlpM&{^$@+iz;2Cx&0ik^Y#D&tTQ;TZG};oWUR+v`)`)G`|;jWkD5pn$uGw& zjtkZGZyg(10mEYA$Hly5n<6G2wi^M}ti2Zup85+C1Mz&`+AkgIJIi!9)RsezUo&gb z6-acyx`lr^(^}BfH_LF?62zwWx9xCU`zOu-u6L!ED&Jt6pL#7u?m3T|N9N_cXkz^p zSg~0b1CP-!wz)VBT$#M|Fjua8BCoghHHkMH*&FooYWZfFxrgZK$5bZ4ozL&{<%=fY zIWO9KTrz5vrV9@u2lj?suYG@b=T^Zf%YCZnv8SB7u-=8N${#ge^N~frCur{Vzb)C4SHycKV zeP1p;e^w>meAI?6?e-6cz-50x>4a7MEV+8!cC1sa-{$>=lHWZZtX~u!J^mk{Z64Nl z;neV@MFgK!iSy5stL{9S&#mOtT^qe2!1$JAqTN2g-O0QvKp&na`;ndC>QE?tXF 
[... GIT binary patch data for lite/demo/cxx/mobile_detection/test.jpg (new file, 127499 bytes) omitted ...]
literal 0
HcmV?d00001

diff --git a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
index 5ac041b2cc..0c9da1a764 100644
--- a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
+++ b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
@@ -13,12 +13,10 @@
 // limitations under the License.
 #include 
-#include 
+#include 
 #include 
-#include "paddle_api.h" // NOLINT
-#include "paddle_use_kernels.h" // NOLINT
-#include "paddle_use_ops.h" // NOLINT
-#include "paddle_use_passes.h" // NOLINT
+#include "paddle_api.h" // NOLINT
+#include "paddle_use_passes.h" // NOLINT
 using namespace paddle::lite_api; // NOLINT
@@ -32,11 +30,21 @@ int64_t ShapeProduction(const shape_t& shape) {
   return res;
 }
+// 0. Enable OpenCL, if needed
+// Enable the `DEMO_WITH_OPENCL` macro below if you need to run on the GPU (OpenCL)
+// #define DEMO_WITH_OPENCL
 void RunModel() {
   // 1. Set CxxConfig
   CxxConfig config;
   config.set_model_dir(FLAGS_model_dir);
+#ifdef DEMO_WITH_OPENCL
+  std::vector valid_places{
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNCHW)},
+      Place{TARGET(kOpenCL), PRECISION(kFloat), DATALAYOUT(kNHWC)},
+      Place{TARGET(kARM), PRECISION(kFloat)}};
+#else
   std::vector valid_places{Place{TARGET(kARM), PRECISION(kFloat)}};
+#endif
   if (FLAGS_prefer_int8_kernel) {
     valid_places.insert(valid_places.begin(),
                         Place{TARGET(kARM), PRECISION(kInt8)});
@@ -68,14 +76,22 @@ void RunModel() {
   // 6. Get output
   std::unique_ptr output_tensor(
       std::move(predictor->GetOutput(0)));
-  printf("Output dim: %d\n", output_tensor->shape()[1]);
+  std::cout << "Output shape " << output_tensor->shape()[1] << std::endl;
   for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
-    printf("Output[%d]: %f\n", i, output_tensor->data()[i]);
+    std::cout << "Output[" << i << "]: " << output_tensor->data()[i]
+              << std::endl;
   }
 }
 int main(int argc, char** argv) {
   google::ParseCommandLineFlags(&argc, &argv, true);
+  if (FLAGS_model_dir == "" || FLAGS_optimized_model_dir == "") {
+    std::cerr << "[ERROR] usage: " << argv[0]
+              << " --model_dir="
+              << " --optimized_model_dir= "
+              << " --prefer_int8_kernel=[true|false]\n";
+    exit(1);
+  }
   RunModel();
   return 0;
 }
diff --git a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
index e1833814ca..c40e3d5e9a 100644
--- a/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
+++ b/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc
@@ -12,27 +12,22 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include 
-#include 
+#include 
 #include 
-#include "paddle_api.h" // NOLINT
-#include "paddle_use_kernels.h" // NOLINT
-#include "paddle_use_ops.h" // NOLINT
+#include "paddle_api.h" // NOLINT
 using namespace paddle::lite_api; // NOLINT
-DEFINE_string(model_dir, "", "Model dir path.");
-
 int64_t ShapeProduction(const shape_t& shape) {
   int64_t res = 1;
   for (auto i : shape) res *= i;
   return res;
 }
-void RunModel() {
+void RunModel(std::string model_dir) {
   // 1. Set MobileConfig
   MobileConfig config;
-  config.set_model_dir(FLAGS_model_dir);
+  config.set_model_dir(model_dir);
   // 2. Create PaddlePredictor by MobileConfig
   std::shared_ptr predictor =
@@ -52,14 +47,19 @@ void RunModel() {
   // 5. Get output
   std::unique_ptr output_tensor(
       std::move(predictor->GetOutput(0)));
-  printf("Output dim: %d\n", output_tensor->shape()[1]);
+  std::cout << "Output shape " << output_tensor->shape()[1] << std::endl;
   for (int i = 0; i < ShapeProduction(output_tensor->shape()); i += 100) {
-    printf("Output[%d]: %f\n", i, output_tensor->data()[i]);
+    std::cout << "Output[" << i << "]: " << output_tensor->data()[i]
+              << std::endl;
   }
 }
 int main(int argc, char** argv) {
-  google::ParseCommandLineFlags(&argc, &argv, true);
-  RunModel();
+  if (argc < 2) {
+    std::cerr << "[ERROR] usage: ./" << argv[0] << " naive_buffer_model_dir\n";
+    exit(1);
+  }
+  std::string model_dir = argv[1];
+  RunModel(model_dir);
   return 0;
 }
diff --git a/lite/gen_code/CMakeLists.txt b/lite/gen_code/CMakeLists.txt
index 56c70cf1e1..40c9541554 100644
--- a/lite/gen_code/CMakeLists.txt
+++ b/lite/gen_code/CMakeLists.txt
@@ -18,7 +18,6 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc
     XPU_DEPS ${xpu_kernels}
     CL_DEPS ${opencl_kernels}
     FPGA_DEPS ${fpga_kernels}
-    BM_DEPS ${bm_kernels}
     EXCLUDE_COMPILE_DEPS "ON"
     ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -47,7 +46,6 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co
     XPU_DEPS ${xpu_kernels}
     CL_DEPS ${opencl_kernels}
     FPGA_DEPS ${fpga_kernels}
-    BM_DEPS ${bm_kernels}
     EXCLUDE_COMPILE_DEPS "ON"
     )
diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt
index 8949602cab..0c8866eaf8 100644
--- a/lite/kernels/arm/CMakeLists.txt
+++ b/lite/kernels/arm/CMakeLists.txt
@@ -1,3 +1,5 @@
+
+# 1. basic kernels for basic models
 # for conv op
 add_kernel(conv_depthwise ARM basic SRCS conv_depthwise.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(conv_direct ARM basic SRCS conv_direct.cc DEPS ${lite_kernel_deps} math_arm)
@@ -14,51 +16,58 @@ add_kernel(scale_compute_arm ARM basic SRCS scale_compute.cc DEPS ${lite_kernel_
 add_kernel(softmax_compute_arm ARM basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(batch_norm_compute_arm ARM basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(elementwise_compute_arm ARM basic SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(lrn_compute_arm ARM basic SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(decode_bboxes_compute_arm ARM basic SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm)
+
 add_kernel(pool_compute_arm ARM basic SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(split_compute_arm ARM basic SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(concat_compute_arm ARM basic SRCS concat_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(pad2d_compute_arm ARM basic SRCS pad2d_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(prior_box_compute_arm ARM basic SRCS prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(density_prior_box_compute_arm ARM basic SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(negative_compute_arm ARM basic SRCS negative_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(crop_compute_arm ARM basic SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(dropout_compute_arm ARM basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(calib_compute_arm ARM basic SRCS calib_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(transpose_compute_arm ARM basic SRCS transpose_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(power_compute_arm ARM basic SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(yolo_box_compute_arm ARM basic SRCS yolo_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(shuffle_channel_compute_arm ARM basic SRCS shuffle_channel_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(argmax_compute_arm ARM basic SRCS argmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(axpy_compute_arm ARM basic SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(conv_transpose_compute_arm ARM basic SRCS conv_transpose_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(norm_compute_arm ARM basic SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(interpolate_compute_arm ARM basic SRCS interpolate_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(box_coder_compute_arm ARM basic SRCS box_coder_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(shape_compute_arm ARM basic SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(slice_compute_arm ARM basic SRCS slice_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(cast_compute_arm ARM basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(squeeze_compute_arm ARM basic SRCS squeeze_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(unsqueeze_compute_arm ARM extra SRCS unsqueeze_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(unsqueeze_compute_arm ARM basic SRCS unsqueeze_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(expand_compute_arm ARM basic SRCS expand_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(reduce_max_compute_arm ARM basic SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(sequence_expand_compute_arm ARM basic SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(im2sequence_compute_arm ARM basic SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(sequence_pool_compute_arm ARM basic SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(reduce_mean_compute_arm ARM basic SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(stack_compute_arm ARM basic SRCS stack_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(affine_channel_compute_arm ARM basic SRCS affine_channel_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(range_compute_arm ARM basic SRCS range_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(dropout_compute_arm ARM basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(layout_compute_arm ARM basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} math_arm)
+
+## 2. other basic kernels: basic kernels that are not used in basic models
+add_kernel(negative_compute_arm ARM extra SRCS negative_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(crop_compute_arm ARM extra SRCS crop_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(power_compute_arm ARM extra SRCS power_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(norm_compute_arm ARM extra SRCS norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(assign_compute_arm ARM extra SRCS assign_compute.cc DEPS ${lite_kernel_deps} math_arm)
+
+## 3. extra kernels
+add_kernel(lrn_compute_arm ARM extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(decode_bboxes_compute_arm ARM extra SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(density_prior_box_compute_arm ARM extra SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(axpy_compute_arm ARM extra SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(shape_compute_arm ARM extra SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(reduce_max_compute_arm ARM extra SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(sequence_expand_compute_arm ARM extra SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(im2sequence_compute_arm ARM extra SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(sequence_pool_compute_arm ARM extra SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(layer_norm_compute_arm ARM extra SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(gather_compute_arm ARM extra SRCS gather_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(reduce_mean_compute_arm ARM extra SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(stack_compute_arm ARM extra SRCS stack_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(assign_compute_arm ARM extra SRCS assign_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(affine_channel_compute_arm ARM extra SRCS affine_channel_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(anchor_generator_compute_arm ARM extra SRCS anchor_generator_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(generate_proposals_compute_arm ARM extra SRCS generate_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(roi_align_compute_arm ARM extra SRCS roi_align_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(box_clip_compute_arm ARM extra SRCS box_clip_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(range_compute_arm ARM extra SRCS range_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(assign_value_compute_arm ARM extra SRCS assign_value_compute.cc DEPS ${lite_kernel_deps} math_arm)
+
 # for OCR specific
 add_kernel(gru_unit_compute_arm ARM extra SRCS gru_unit_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(gru_compute_arm ARM extra SRCS gru_compute.cc DEPS ${lite_kernel_deps} math_arm)
@@ -74,7 +83,7 @@ add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite
 add_kernel(write_to_array_compute_arm ARM extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(read_from_array_compute_arm ARM extra SRCS read_from_array_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(fill_constant_compute_arm ARM extra SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm)
+add_kernel(fill_constant_compute_arm ARM basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(lod_reset_compute_arm ARM extra SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(is_empty_compute_arm ARM extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps} math_arm)
@@ -90,18 +99,17 @@ lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_comput
 lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm)
 lite_cc_test(test_batch_norm_compute_arm SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_arm)
 lite_cc_test(test_elementwise_compute_arm SRCS elementwise_compute_test.cc DEPS elementwise_compute_arm)
-lite_cc_test(test_lrn_compute_arm SRCS lrn_compute_test.cc DEPS lrn_compute_arm)
-lite_cc_test(test_decode_bboxes_compute_arm SRCS decode_bboxes_compute_test.cc DEPS decode_bboxes_compute_arm)
 lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm)
 lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm)
 lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm)
 lite_cc_test(test_concat_compute_arm SRCS concat_compute_test.cc DEPS concat_compute_arm)
-lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_compute_arm)
 lite_cc_test(test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS transpose_compute_arm COMPILE_LEVEL extra)
 lite_cc_test(test_argmax_compute_arm SRCS argmax_compute_test.cc DEPS argmax_compute_arm)
-lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm)
-lite_cc_test(test_conv_transpose_compute_arm SRCS conv_transpose_compute_test.cc DEPS conv_transpose_compute_arm)
-
+lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_compute_arm)
 if(LITE_BUILD_EXTRA)
+  lite_cc_test(test_lrn_compute_arm SRCS lrn_compute_test.cc DEPS lrn_compute_arm)
+  lite_cc_test(test_decode_bboxes_compute_arm SRCS decode_bboxes_compute_test.cc DEPS decode_bboxes_compute_arm)
+  lite_cc_test(test_axpy_compute_arm SRCS axpy_compute_test.cc DEPS axpy_compute_arm)
   lite_cc_test(test_layer_norm_compute_arm SRCS layer_norm_compute_test.cc DEPS layer_norm_compute_arm)
+  lite_cc_test(test_lookup_table_compute_arm SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_arm)
 endif()
diff --git
a/lite/kernels/arm/conv_compute.cc b/lite/kernels/arm/conv_compute.cc index ebb96e21d5..69e507ba34 100644 --- a/lite/kernels/arm/conv_compute.cc +++ b/lite/kernels/arm/conv_compute.cc @@ -32,13 +32,18 @@ void ConvCompute::PrepareForRun() { auto w_dims = param.filter->dims(); auto& ctx = this->ctx_->template As(); + auto paddings = *param.paddings; + auto dilations = *param.dilations; int ic = w_dims[1] * param.groups; int oc = w_dims[0]; int kh = w_dims[2]; // oihw int kw = w_dims[3]; - int pad = param.paddings[0]; + int pad = paddings[0]; int stride = param.strides[0]; + int threads = ctx.threads(); + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); int chin = param.x->dims()[1]; int hin = param.x->dims()[2]; int win = param.x->dims()[3]; @@ -46,22 +51,28 @@ void ConvCompute::PrepareForRun() { int hout = param.output->dims()[2]; int wout = param.output->dims()[3]; - bool kps_equal = (param.paddings[0] == param.paddings[1]) && - (param.strides[0] == param.strides[1]) && (kw == kh); - bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1); + bool pads_all_equal = (pads_equal && paddings[0] == paddings[2]); + + bool kps_equal = (param.strides[0] == param.strides[1]) && (kw == kh); + bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); bool flag_dw_3x3 = (kw == 3 && kh == 3 && (stride == 1 || stride == 2)); - bool flag_dw_5x5 = - (kw == 5 && stride == 1) || (kw == 5 && stride == 2 && pad == 2); + bool flag_dw_5x5 = pads_all_equal && ((kw == 5 && stride == 1) || + (kw == 5 && stride == 2 && pad == 2)); bool flag_dw = flag_dw_3x3 || flag_dw_5x5; /// select conv impl - if (param.groups == ic && ic == oc && kps_equal && no_dilation && flag_dw) { + if (param.groups == ic && ic == oc && kps_equal && pads_equal && + no_dilation && flag_dw) { /// dw conv impl impl_ = new DepthwiseConv; VLOG(3) << "invoking dw conv"; } else if (param.groups == 1 && kw == 3 && stride == 1 && kps_equal && no_dilation) { - if (ic >= 32 && oc >= 32 && hout > 16 && wout > 16) { + bool use_winograd = + (threads == 1 && oc >= 4 && ic >= 4 && hout >= 6 && wout >= 6 && + pads_equal) || + (oc >= 32 && ic >= 32 && hout >= 16 && wout >= 16 && pads_equal); + if (use_winograd) { /// winograd conv impl impl_ = new WinogradConv; VLOG(3) << "invoking winograd conv"; @@ -92,22 +103,29 @@ void ConvCompute::PrepareForRun() { auto& ctx = this->ctx_->template As(); + auto paddings = *param.paddings; + auto dilations = *param.dilations; + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); int ic = param.groups * w_dims[1]; int oc = w_dims[0]; int kh = w_dims[2]; // oihw int kw = w_dims[3]; - int ph = param.paddings[1]; - int pw = param.paddings[0]; + int ph = paddings[0]; + int pw = paddings[2]; int sh = param.strides[1]; int sw = param.strides[0]; + bool pads_all_equal = (pads_equal && paddings[0] == paddings[2]); bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh); - bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1); - bool flag_dw_3x3 = (kw == 3 && kh == 3) && (sw == 1 || sw == 2); - bool flag_dw_5x5 = (kw == 5 && sw == 1); + bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); + bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2)); + bool flag_dw_5x5 = pads_all_equal && + ((kw == 5 && sw == 1) || (kw == 5 && sw == 2 && pw == 2)); bool flag_dw = flag_dw_3x3 || flag_dw_5x5; - if (param.groups == ic && ic == oc && kps_equal && no_dilation && flag_dw) { + if (param.groups == ic && ic == oc && 
kps_equal && pads_equal && + no_dilation && flag_dw) { impl_ = new DepthwiseConv; VLOG(3) << "Run DepthwiseConv Int8"; } else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) && @@ -130,23 +148,30 @@ void ConvCompute::PrepareForRun() { auto w_dims = param.filter->dims(); auto& ctx = this->ctx_->template As(); + auto paddings = *param.paddings; + auto dilations = *param.dilations; + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); int ic = w_dims[1] * param.groups; int oc = w_dims[0]; int kh = w_dims[2]; // oihw int kw = w_dims[3]; - int ph = param.paddings[1]; - int pw = param.paddings[0]; + int ph = paddings[0]; + int pw = paddings[2]; int sh = param.strides[1]; int sw = param.strides[0]; + bool pads_all_equal = (pads_equal && paddings[0] == paddings[2]); bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh); - bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1); - bool flag_dw_3x3 = (kw == 3 && kh == 3) && (sw == 1 || sw == 2); - bool flag_dw_5x5 = (kw == 5 && sw == 1); + bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); + bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2)); + bool flag_dw_5x5 = pads_all_equal && + ((kw == 5 && sw == 1) || (kw == 5 && sw == 2 && pw == 2)); bool flag_dw = flag_dw_3x3 || flag_dw_5x5; - if (param.groups == ic && ic == oc && kps_equal && no_dilation && flag_dw) { + if (param.groups == ic && ic == oc && kps_equal && pads_equal && + no_dilation && flag_dw) { impl_ = new DepthwiseConv; VLOG(3) << "Run DepthwiseConv Int8"; } else if (param.groups == 1 && kw == 3 && (sw == 1 || sw == 2) && @@ -194,7 +219,7 @@ REGISTER_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, ConvFp32, def) REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, ConvInt8_Int8, int8_out) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Output", @@ -203,7 +228,7 @@ REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, ConvInt8_Int8, int8_out) REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, ConvInt8_Fp32, fp32_out) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Output", @@ -213,7 +238,7 @@ REGISTER_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, ConvInt8_Fp32, fp32_out) REGISTER_LITE_KERNEL( depthwise_conv2d, kARM, kInt8, kNCHW, ConvInt8_Int8, int8_out) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindInput("Filter", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Output", @@ -223,7 +248,7 @@ REGISTER_LITE_KERNEL( REGISTER_LITE_KERNEL( depthwise_conv2d, kARM, kInt8, kNCHW, ConvInt8_Fp32, fp32_out) .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) - .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kFloat))}) .BindInput("Filter", 
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt8))}) .BindOutput("Output", diff --git a/lite/kernels/arm/conv_depthwise.cc b/lite/kernels/arm/conv_depthwise.cc index 6a20d607e3..e2eaef51dd 100644 --- a/lite/kernels/arm/conv_depthwise.cc +++ b/lite/kernels/arm/conv_depthwise.cc @@ -31,19 +31,28 @@ void DepthwiseConv::PrepareForRun() { // select dw conv kernel if (kw == 3) { VLOG(5) << "invoke 3x3 dw conv fp32"; - // trans weights - constexpr int cblock = 4; - auto oc = w_dims[0]; - auto kh = w_dims[2]; - auto cround = ROUNDUP(oc, cblock); - weights_.Resize({cround, 1, kh, kw}); - // auto w_data = weights_.mutable_data(); - // auto w_data_in = param.filter->data(); - // lite::arm::math::conv_trans_weights_numc( - // w_data_in, w_data, oc, 1, cblock, kh * kw); - impl_ = lite::arm::math::conv_depthwise_3x3_fp32; - flag_trans_weights_ = false; - // flag_trans_weights_ = true; + auto paddings = *param.paddings; + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + + if (pads_equal && paddings[0] == paddings[2] && + (paddings[0] == 0 || paddings[0] == 1)) { + impl_ = lite::arm::math::conv_depthwise_3x3_fp32; + flag_trans_weights_ = false; + } else { + // trans weights + constexpr int cblock = 4; + auto oc = w_dims[0]; + auto kh = w_dims[2]; + auto cround = ROUNDUP(oc, cblock); + weights_.Resize({cround, 1, kh, kw}); + auto w_data = weights_.mutable_data(); + auto w_data_in = param.filter->data(); + lite::arm::math::conv_trans_weights_numc( + w_data_in, w_data, oc, 1, cblock, kh * kw); + impl_ = lite::arm::math::conv_depthwise_3x3_fp32; + flag_trans_weights_ = true; + } } else if (kw == 5) { VLOG(5) << "invoke 5x5 dw conv fp32"; impl_ = lite::arm::math::conv_depthwise_5x5_fp32; diff --git a/lite/kernels/arm/conv_gemmlike.h b/lite/kernels/arm/conv_gemmlike.h index e00b8de6f4..5e59eb8d17 100644 --- a/lite/kernels/arm/conv_gemmlike.h +++ b/lite/kernels/arm/conv_gemmlike.h @@ -52,12 +52,19 @@ class GemmLikeConv : public KernelLite { int oc = o_dims[1]; int kw = w_dims[3]; int kh = w_dims[2]; + + auto paddings = *param.paddings; + auto dilations = *param.dilations; + int sw = param.strides[1]; int sh = param.strides[0]; - int pw = param.paddings[1]; - int ph = param.paddings[0]; - int dw = param.dilations[1]; - int dh = param.dilations[0]; + int pw = paddings[2]; + int ph = paddings[0]; + int dw = dilations[1]; + int dh = dilations[0]; + + bool pads_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); int m = oc / param.groups; int k = ic * kh * kw / param.groups; @@ -66,7 +73,7 @@ class GemmLikeConv : public KernelLite { bool kps_equal = (pw == ph) && (sw == sh) && (kw == kh); bool ks_equal = (sw == sh) && (kw == kh); //! select conv gemmlike kernel - if (kw == 1 && sw == 1 && pw == 0 && kps_equal) { + if (kw == 1 && sw == 1 && pw == 0 && kps_equal && pads_equal) { //! 
1x1s1p0 gemmlike conv flag_1x1gemm_ = true; } else { diff --git a/lite/kernels/arm/conv_transpose_compute.cc b/lite/kernels/arm/conv_transpose_compute.cc index 5a18499c85..5c58b29713 100644 --- a/lite/kernels/arm/conv_transpose_compute.cc +++ b/lite/kernels/arm/conv_transpose_compute.cc @@ -76,19 +76,28 @@ void Conv2DTransposeCompute::Run() { bool fuse_relu = param.fuse_relu; bool flag_bias = (param.bias != nullptr); + auto paddings = *param.paddings; + auto dilations = *param.dilations; + int m = chout * kw * kh / group; int n = hin * win; int k = chin / group; + + bool pads_equal = + (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]); + int group_size_in = win * hin * chin / group; int group_size_out = wout * hout * chout / group; int group_size_coldata = m * n; + + bool pads_all_qual = pads_equal && (paddings[0] == paddings[2]); int hblock = lite::arm::math::get_hblock(&ctx); int m_roundup = hblock * ((m + hblock - 1) / hblock); int group_size_weights = ((m_roundup * k + 15) / 16) * 16; bool flag_1x1s1p1 = (kw == 1) && (kh == 1) && (param.strides[0] == 1) && - (param.strides[1] == 1) && (param.paddings[0] == 0) && - (param.paddings[1] == 0) && (param.dilations[0] == 1) && - (param.dilations[1] == 1); + (param.strides[1] == 1) && pads_all_qual && + (paddings[0] == 0) && (dilations[0] == 1) && + (dilations[1] == 1); ctx.ExtendWorkspace(sizeof(float) * group * m * n); auto din = param.x->data(); @@ -129,12 +138,14 @@ void Conv2DTransposeCompute::Run() { wout, kh, kw, - param.paddings[0], - param.paddings[1], + paddings[0], + paddings[1], + paddings[2], + paddings[3], param.strides[0], param.strides[1], - param.dilations[0], - param.dilations[1], + dilations[0], + dilations[1], dout_batch); } if (flag_bias) { diff --git a/lite/kernels/arm/conv_winograd.cc b/lite/kernels/arm/conv_winograd.cc index d1b8d8a48e..d02cabf277 100644 --- a/lite/kernels/arm/conv_winograd.cc +++ b/lite/kernels/arm/conv_winograd.cc @@ -26,6 +26,7 @@ template <> void WinogradConv::ReInitWhenNeeded() { auto& param = this->Param(); auto& ctx = this->ctx_->template As(); + int threads = ctx.threads(); auto x_dims = param.x->dims(); auto w_dims = param.filter->dims(); @@ -36,77 +37,97 @@ void WinogradConv::ReInitWhenNeeded() { } int ic = x_dims[1]; - int ow = o_dims[3]; - int oh = o_dims[2]; + int ih = x_dims[2]; + int iw = x_dims[3]; int oc = o_dims[1]; - int tile_w = (ow + 5) / 6; - int tile_h = (oh + 5) / 6; - int size_tile = tile_h * tile_w; - int size_trans_channel = 8 * 8 * size_tile; - int max_ch = ic > oc ? 
ic : oc; - - const int n_wino = size_tile; - workspace_size_ = (size_trans_channel * max_ch * 2 + n_wino) * sizeof(float); + int oh = o_dims[2]; + int ow = o_dims[3]; + int tile_block = 8; +#ifdef __aarch64__ + tile_block = 16; +#endif + int parallel_threads = + (((ow + 5) / 6) * ((oh + 5) / 6) + tile_block - 1) / tile_block; + if (threads <= 2 && parallel_threads >= threads) { + if (last_kernel_is_c4_ == 1) { + return; + } + last_kernel_is_c4_ = 1; + auto pad = *(param.paddings); + int pad_h = pad[0]; + int pad_w = pad[2]; + int oc_pad = (oc + 3) / 4 * 4; + int ic_pad = (ic + 3) / 4 * 4; + const int new_input_size = + (ic + 3) / 4 * 4 * (ih + pad_h * 2) * (iw + pad_w * 2); + const int temp_size = + (tile_block * ((ic + 3) / 4 + (oc + 3) / 4) * 256 + 512) * threads; + ctx.ExtendWorkspace((temp_size + new_input_size) * sizeof(float)); + + weights_.Resize({1, 1, 1, 64 * oc_pad * ic_pad}); + ctx.ExtendWorkspace((temp_size + new_input_size) * sizeof(float)); + void* trans_tmp_ptr = malloc(sizeof(float) * 8 * 8 * oc * ic); + auto weights_data_ = weights_.mutable_data(); + lite::arm::math::weight_trans_c4( + weights_data_, param.filter->data(), ic, oc, trans_tmp_ptr); + free(trans_tmp_ptr); + } else { + if (last_kernel_is_c4_ == 0) { + return; + } + last_kernel_is_c4_ = 0; + int tile_w = (ow + 5) / 6; + int tile_h = (oh + 5) / 6; + + int size_tile = tile_h * tile_w; + int size_trans_channel = 8 * 8 * size_tile; + int max_ch = ic > oc ? ic : oc; + + const int n_wino = size_tile; + ctx.ExtendWorkspace((size_trans_channel * max_ch * 2 + n_wino) * + sizeof(float)); + + const int m_wino = oc; + int hblock = lite::arm::math::get_hblock(&ctx); + int m_round = hblock * ((m_wino + hblock - 1) / hblock); + weights_.Resize({1, 1, 1, 8 * 8 * m_round * ic}); + ctx.ExtendWorkspace((size_trans_channel * max_ch * 2 + n_wino) * + sizeof(float)); + auto weights_wino = + static_cast(malloc(sizeof(float) * 8 * 8 * oc * ic)); + void* trans_tmp_ptr = malloc(sizeof(float) * 8 * 8 * oc * ic); + lite::arm::math::winograd_transform_weights( + weights_wino, param.filter->data(), oc, ic, trans_tmp_ptr); + auto weights_trans = weights_.mutable_data(); + for (int i = 0; i < 64; ++i) { + float* packed_weights = weights_trans + i * m_round * ic; + const float* weights_wino_ptr = weights_wino + i * oc * ic; + lite::arm::math::prepackA(packed_weights, + weights_wino_ptr, + 1.f, + ic, + 0, + m_wino, + 0, + ic, + false, + &ctx); + } + free(trans_tmp_ptr); + free(weights_wino); + } last_shape_ = x_dims; } template <> void WinogradConv::PrepareForRun() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - - auto x_dims = param.x->dims(); - auto w_dims = param.filter->dims(); - auto o_dims = param.output->dims(); - last_shape_ = x_dims; - - int ic = x_dims[1]; - int ow = o_dims[3]; - int oh = o_dims[2]; - int oc = o_dims[1]; - int tile_w = (ow + 5) / 6; - int tile_h = (oh + 5) / 6; - int size_tile = tile_h * tile_w; - int size_trans_channel = 8 * 8 * size_tile; - int max_ch = ic > oc ? 
ic : oc; - - const int m_wino = oc; - const int n_wino = size_tile; - int hblock = lite::arm::math::get_hblock(&ctx); - int m_round = hblock * ((m_wino + hblock - 1) / hblock); - weights_.Resize({1, 1, 1, 8 * 8 * m_round * ic}); - workspace_size_ = (size_trans_channel * max_ch * 2 + n_wino) * sizeof(float); - auto weights_wino = - static_cast(malloc(sizeof(float) * 8 * 8 * oc * ic)); - void* trans_tmp_ptr = malloc(sizeof(float) * 8 * 8 * oc * ic); - lite::arm::math::winograd_transform_weights( - weights_wino, param.filter->data(), oc, ic, trans_tmp_ptr); - auto weights_trans = weights_.mutable_data(); - for (int i = 0; i < 64; ++i) { - float* packed_weights = weights_trans + i * m_round * ic; - const float* weights_wino_ptr = weights_wino + i * oc * ic; - lite::arm::math::prepackA(packed_weights, - weights_wino_ptr, - 1.f, - ic, - 0, - m_wino, - 0, - ic, - false, - &ctx); - } - free(trans_tmp_ptr); - free(weights_wino); + ReInitWhenNeeded(); } template <> void WinogradConv::Run() { auto& param = this->Param(); auto& ctx = this->ctx_->template As(); - // extend workspace - ctx.ExtendWorkspace(workspace_size_); - const auto* i_data = param.x->data(); const auto* w_data = weights_.data(); const auto* b_data = param.bias ? param.bias->data() : nullptr; @@ -124,8 +145,42 @@ void WinogradConv::Run() { int ow = o_dims[3]; int oc = o_dims[1]; - lite::arm::math::conv_winograd3x3( - i_data, o_data, bs, oc, oh, ow, ic, ih, iw, w_data, b_data, param, &ctx); + int tile_block = 8; +#ifdef __aarch64__ + tile_block = 16; +#endif + int threads = ctx.threads(); + int parallel_threads = + (((ow + 5) / 6) * ((oh + 5) / 6) + tile_block - 1) / tile_block; + if (threads <= 2 && parallel_threads >= threads) { + lite::arm::math::conv_compute_6x6_3x3(i_data, + o_data, + bs, + oc, + oh, + ow, + ic, + ih, + iw, + w_data, + b_data, + param, + &ctx); + } else { + lite::arm::math::conv_winograd3x3(i_data, + o_data, + bs, + oc, + oh, + ow, + ic, + ih, + iw, + w_data, + b_data, + param, + &ctx); + } } } // namespace arm diff --git a/lite/kernels/arm/conv_winograd.h b/lite/kernels/arm/conv_winograd.h index 33f0edc017..40ea54b291 100644 --- a/lite/kernels/arm/conv_winograd.h +++ b/lite/kernels/arm/conv_winograd.h @@ -40,6 +40,7 @@ class WinogradConv : public KernelLite { Tensor weights_; DDim last_shape_; int workspace_size_{0}; + int last_kernel_is_c4_{-1}; }; } // namespace arm diff --git a/lite/kernels/arm/fc_compute.cc b/lite/kernels/arm/fc_compute.cc index 1983c73318..525eca269b 100644 --- a/lite/kernels/arm/fc_compute.cc +++ b/lite/kernels/arm/fc_compute.cc @@ -127,7 +127,8 @@ void FcCompute::Run() { k_, param.bias != nullptr, b_data, - false); + false, + &ctx); } } } diff --git a/lite/kernels/arm/fill_constant_compute.cc b/lite/kernels/arm/fill_constant_compute.cc index 0b1911abf4..05d43dddec 100644 --- a/lite/kernels/arm/fill_constant_compute.cc +++ b/lite/kernels/arm/fill_constant_compute.cc @@ -25,6 +25,38 @@ class FillConstantCompute : public KernelLite { public: using param_t = operators::FillConstantParam; + inline DDimLite GetShape(const param_t& param) { + // 1. shape is a Tensor + if (param.shape_tensor != nullptr) { + auto* shape_tensor = param.shape_tensor; + auto* shape_data = shape_tensor->data(); + auto vec_shape = + std::vector(shape_data, shape_data + shape_tensor->numel()); + return DDimLite(vec_shape); + } + + // 2. 
shape is a list/tuple containing Tensor + auto shape_tensor_list = param.shape_tensor_list; + if (shape_tensor_list.size() > 0) { + std::vector vec_shape; + for (size_t i = 0; i < shape_tensor_list.size(); ++i) { + auto tensor = shape_tensor_list[i]; + vec_shape.push_back(*tensor->data()); + } + return DDimLite(vec_shape); + } + + // 3. shape is a list/tuple without containing Tensor + auto vec_shape = param.shape; + return DDimLite(vec_shape); + } + + void PrepareForRun() override { + auto& param = *param_.get_mutable(); + auto outdims = GetShape(param); + param.Out->Resize(outdims); + } + void Run() override { auto& param = *param_.get_mutable(); auto& context = ctx_->As(); @@ -107,6 +139,11 @@ REGISTER_LITE_KERNEL(fill_constant, kNCHW, paddle::lite::kernels::arm::FillConstantCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("ShapeTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("ShapeTensorList", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); REGISTER_LITE_KERNEL( diff --git a/lite/kernels/arm/interpolate_compute.cc b/lite/kernels/arm/interpolate_compute.cc index a26777826d..0398dabeae 100644 --- a/lite/kernels/arm/interpolate_compute.cc +++ b/lite/kernels/arm/interpolate_compute.cc @@ -28,6 +28,8 @@ void BilinearInterpCompute::Run() { auto& param = Param(); lite::Tensor* X = param.X; lite::Tensor* OutSize = param.OutSize; + auto SizeTensor = param.SizeTensor; + auto Scale = param.Scale; lite::Tensor* Out = param.Out; float scale = param.scale; int out_w = param.out_w; @@ -36,11 +38,12 @@ void BilinearInterpCompute::Run() { std::string interp_method = "Bilinear"; lite::arm::math::interpolate(X, OutSize, + SizeTensor, + Scale, Out, out_h, out_w, scale, - scale, align_corners, interp_method); } @@ -49,6 +52,8 @@ void NearestInterpCompute::Run() { auto& param = Param(); lite::Tensor* X = param.X; lite::Tensor* OutSize = param.OutSize; + auto SizeTensor = param.SizeTensor; + auto Scale = param.Scale; lite::Tensor* Out = param.Out; float scale = param.scale; int out_w = param.out_w; @@ -57,11 +62,12 @@ void NearestInterpCompute::Run() { std::string interp_method = "Nearest"; lite::arm::math::interpolate(X, OutSize, + SizeTensor, + Scale, Out, out_h, out_w, scale, - scale, align_corners, interp_method); } @@ -79,6 +85,8 @@ REGISTER_LITE_KERNEL(bilinear_interp, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("OutSize", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("SizeTensor", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); @@ -90,5 +98,7 @@ REGISTER_LITE_KERNEL(nearest_interp, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("OutSize", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("SizeTensor", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); diff --git a/lite/kernels/arm/layout_compute.cc b/lite/kernels/arm/layout_compute.cc new file mode 100644 index 0000000000..bc52c5ea3e --- /dev/null +++ b/lite/kernels/arm/layout_compute.cc @@ -0,0 +1,179 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/arm/layout_compute.h" +#include "lite/backends/arm/math/funcs.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +#define NCHWTONHWC(type) \ + auto& param = this->template Param(); \ + auto input = param.x->template data(); \ + auto input_dim = param.x->dims(); \ + CHECK(input_dim.size() == 4) \ + << "NCHW to NHWC should guarantee that the input dims should be 4"; \ + int n = input_dim[0]; \ + int c = input_dim[1]; \ + int h = input_dim[2]; \ + int w = input_dim[3]; \ + param.y->Resize({n, h, w, c}); \ + auto output = param.y->template mutable_data(TARGET(kARM)); \ + if (c == 1) { \ + memcpy(output, input, sizeof(type) * n * h * w); \ + return; \ + } \ + lite::arm::math::NCHW2NHWC(n, c, h * w, input, output); + +#define NHWCTONCHW(type) \ + auto& param = this->template Param(); \ + auto input = param.x->template data(); \ + auto input_dim = param.x->dims(); \ + CHECK(input_dim.size() == 4) \ + << "NHWC to NCHW should guarantee that the input dims should be 4"; \ + int n = input_dim[0]; \ + int h = input_dim[1]; \ + int w = input_dim[2]; \ + int c = input_dim[3]; \ + param.y->Resize({n, c, h, w}); \ + auto output = param.y->template mutable_data(TARGET(kARM)); \ + if (c == 1) { \ + memcpy(output, input, sizeof(type) * n * h * w); \ + return; \ + } \ + lite::arm::math::NHWC2NCHW(n, c, h * w, input, output); + +template <> +void NCHWToNHWCCompute::Run() { + NCHWTONHWC(float); +} + +template <> +void NCHWToNHWCCompute::Run() { + NCHWTONHWC(int8_t); +} + +template <> +void NHWCToNCHWCompute::Run() { + NHWCTONCHW(float); +} + +template <> +void NHWCToNCHWCompute::Run() { + NHWCTONCHW(int8_t); +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +typedef paddle::lite::kernels::arm::NCHWToNHWCCompute + NCHW_fp32; +typedef paddle::lite::kernels::arm::NCHWToNHWCCompute + NCHW_int8; +typedef paddle::lite::kernels::arm::NHWCToNCHWCompute + NHWC_fp32; +typedef paddle::lite::kernels::arm::NHWCToNCHWCompute + NHWC_int8; + +REGISTER_LITE_KERNEL(layout, kARM, kFloat, kNCHW, NCHW_fp32, nchw2nhwc) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout, kARM, kFloat, kNCHW, NHWC_fp32, nhwc2nchw) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout, kARM, kInt8, kNCHW, NCHW_int8, int8_nchw2nhwc) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout, kARM, kInt8, 
kNCHW, NHWC_int8, int8_nhwc2nchw) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, NCHW_fp32, nchw2nhwc) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, NHWC_fp32, nhwc2nchw) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, NCHW_int8, int8_nchw2nhwc) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, NHWC_int8, int8_nhwc2nchw) + .BindInput("Input", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kARM), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/arm/layout_compute.h b/lite/kernels/arm/layout_compute.h new file mode 100644 index 0000000000..13b8621029 --- /dev/null +++ b/lite/kernels/arm/layout_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
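For reference, the index mapping that the NCHW2NHWC call in the layout macros above performs can be written as a naive scalar loop. This is only an illustrative sketch (the real lite::arm::math routine is NEON-optimized and the macros also take the memcpy fast path when c == 1):

void nchw_to_nhwc_ref(int n, int c, int hw, const float* in, float* out) {
  // Same argument order as the NCHW2NHWC(n, c, h * w, input, output) call above.
  for (int b = 0; b < n; ++b) {
    const float* in_b = in + b * c * hw;
    float* out_b = out + b * c * hw;
    for (int ch = 0; ch < c; ++ch) {
      for (int s = 0; s < hw; ++s) {
        // NCHW element (ch, s) lands at NHWC position (s, ch).
        out_b[s * c + ch] = in_b[ch * hw + s];
      }
    }
  }
}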
+ +#pragma once + +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { +template +class NCHWToNHWCCompute : public KernelLite { + public: + using param_t = operators::LayoutParam; + void Run() override; + virtual ~NCHWToNHWCCompute() = default; +}; + +template +class NHWCToNCHWCompute : public KernelLite { + public: + using param_t = operators::LayoutParam; + void Run() override; + virtual ~NHWCToNCHWCompute() = default; +}; + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/arm/lookup_table_compute.cc b/lite/kernels/arm/lookup_table_compute.cc index fa7e2c0c3a..ba58b378f4 100644 --- a/lite/kernels/arm/lookup_table_compute.cc +++ b/lite/kernels/arm/lookup_table_compute.cc @@ -28,7 +28,6 @@ namespace arm { void LookupTableCompute::Run() { auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); // inputs auto w = param.W; auto ids = param.Ids; @@ -37,7 +36,7 @@ void LookupTableCompute::Run() { auto table_dim = w->dims(); int64_t ids_numel = ids->numel(); - auto ids_data = ids->data(); + auto ids_data = ids->data(); int64_t row_number = table_dim[0]; int64_t row_width = table_dim[1]; @@ -76,3 +75,14 @@ REGISTER_LITE_KERNEL(lookup_table, .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); + +REGISTER_LITE_KERNEL(lookup_table_v2, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::LookupTableCompute, + def) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/lookup_table_compute_test.cc b/lite/kernels/arm/lookup_table_compute_test.cc new file mode 100644 index 0000000000..78748edf39 --- /dev/null +++ b/lite/kernels/arm/lookup_table_compute_test.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/arm/lookup_table_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace arm { + +void lookup_table_compute_ref(const operators::LookupTableParam ¶m) { + auto *ids_t = param.Ids; + auto *output_t = param.Out; + int64_t padding_idx = param.padding_idx; + auto *ids = ids_t->data(); + int64_t ids_numel = ids_t->dims().production(); + + auto *table_t = param.W; + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + + auto *table = table_t->data(); + auto *output = output_t->mutable_data(); + memset(output, 0, output_t->dims().production() * sizeof(float)); + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != -1 && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(float)); + } else { + CHECK_LT(ids[i], row_number); + CHECK_GE(ids[i], 0); + memcpy(output + i * row_width, + table + ids[i] * row_width, + row_width * sizeof(float)); + } + } +} + +TEST(lookup_table_arm, retrieve_op) { + auto lookup_table = + KernelRegistry::Global().Create( + "lookup_table"); + ASSERT_FALSE(lookup_table.empty()); + ASSERT_TRUE(lookup_table.front()); +} + +TEST(lookup_table_arm, init) { + LookupTableCompute lookup_table; + ASSERT_EQ(lookup_table.precision(), PRECISION(kFloat)); + ASSERT_EQ(lookup_table.target(), TARGET(kARM)); +} + +TEST(lookup_table_arm, compute) { + LookupTableCompute lookup_table; + operators::LookupTableParam param; + lite::Tensor w, ids, out, out_ref; + int64_t padding_idx = -1; + + auto w_dim = DDim(std::vector({4, 5})); + auto ids_dim = DDim(std::vector({3, 2})); + auto out_dim = DDim(std::vector({3, 2, 5})); + + w.Resize(w_dim); + ids.Resize(ids_dim); + out.Resize(out_dim); + out_ref.Resize(out_dim); + + auto *w_data = w.mutable_data(); + auto *ids_data = ids.mutable_data(); + auto *out_data = out.mutable_data(); + auto *out_ref_data = out_ref.mutable_data(); + + int w_num = w_dim.production(); + for (int i = 0; i < w_num; i++) { + w_data[i] = static_cast(i + 1) / (w_num + 1); + } + int ids_num = ids_dim.production(); + for (int i = 0; i < ids_num; i++) { + ids_data[i] = i % 4; + } + int out_num = out_dim.production(); + + param.W = &w; + param.Ids = &ids; + param.Out = &out; + lookup_table.SetParam(param); + lookup_table.Run(); + param.Out = &out_ref; + lookup_table_compute_ref(param); + for (int i = 0; i < out_num; i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace arm +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(lookup_table, kARM, kFloat, kNCHW, def); diff --git a/lite/kernels/arm/lrn_compute.cc b/lite/kernels/arm/lrn_compute.cc index 18e6654282..0476b1e6bd 100644 --- a/lite/kernels/arm/lrn_compute.cc +++ b/lite/kernels/arm/lrn_compute.cc @@ -31,16 +31,16 @@ void LrnCompute::Run() { int channel = x_dims[1]; int h = x_dims[2]; int w = x_dims[3]; - const int local_size = param.local_size; + const int n = param.n; const float alpha = param.alpha; const float beta = param.beta; const float k = param.k; if (param.norm_region == "AcrossChannels") { lite::arm::math::compute_across_channels( - x_data, out_data, num, channel, h, w, local_size, alpha, beta, k); + x_data, out_data, num, channel, h, w, n, alpha, beta, k); } else { lite::arm::math::compute_within_channels( - x_data, out_data, num, channel, h, w, local_size, alpha, beta, k); + x_data, out_data, num, channel, h, w, n, alpha, beta, k); } } @@ -53,4 +53,5 @@ 
REGISTER_LITE_KERNEL( lrn, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::LrnCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("MidOut", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); diff --git a/lite/kernels/arm/lrn_compute_test.cc b/lite/kernels/arm/lrn_compute_test.cc index 8e03000615..e7030d0042 100644 --- a/lite/kernels/arm/lrn_compute_test.cc +++ b/lite/kernels/arm/lrn_compute_test.cc @@ -91,7 +91,7 @@ void lrn_compute_ref(const operators::LrnParam& param) { const dtype* x_data = param.X->data(); dtype* out_data = param.Out->mutable_data(); auto x_dims = param.X->dims(); - int local_size = param.local_size; + int local_size = param.n; float alpha = param.alpha; float beta = param.beta; float k = param.k; @@ -171,7 +171,7 @@ TEST(lrn_arm, compute) { } param.X = &x; param.Out = &output; - param.local_size = local_size; + param.n = local_size; param.alpha = alpha; param.beta = beta; param.k = k; diff --git a/lite/kernels/arm/matmul_compute.cc b/lite/kernels/arm/matmul_compute.cc index 29be34d0c2..d00a5bdc06 100644 --- a/lite/kernels/arm/matmul_compute.cc +++ b/lite/kernels/arm/matmul_compute.cc @@ -232,7 +232,7 @@ void MatMulCompute::Run() { int ldc = n_; if (n_ == 1) { lite::arm::math::sgemv( - x_data, y_data, o_data, false, m_, k_, false, nullptr, false); + x_data, y_data, o_data, false, m_, k_, false, nullptr, false, &ctx); if (fabsf(alpha - 1.f) > 1e-8f) { for (size_t i = 0; i < param.Out->dims().production(); ++i) { o_data[i] *= alpha; diff --git a/lite/kernels/arm/mul_compute.cc b/lite/kernels/arm/mul_compute.cc index fa43b6cf8e..debe9e907c 100644 --- a/lite/kernels/arm/mul_compute.cc +++ b/lite/kernels/arm/mul_compute.cc @@ -48,14 +48,13 @@ void MulCompute::Run() { CHECK_EQ(x_w, y_h) << "x_w must be equal with y_h"; k_ = x_w; - + auto& ctx = this->ctx_->template As(); if (n_ == 1) { lite::arm::math::sgemv( - x_data, y_data, o_data, false, m_, k_, false, nullptr, false); + x_data, y_data, o_data, false, m_, k_, false, nullptr, false, &ctx); } else { constexpr bool is_tranposed_y = false; - auto& ctx = this->ctx_->template As(); int hblock = lite::arm::math::get_hblock(&ctx); int m_round = hblock * ((m_ + hblock - 1) / hblock); ctx.ExtendWorkspace(m_round * k_ * sizeof(float)); diff --git a/lite/kernels/arm/pool_compute.cc b/lite/kernels/arm/pool_compute.cc index 9f02a462a5..c9f0fed478 100644 --- a/lite/kernels/arm/pool_compute.cc +++ b/lite/kernels/arm/pool_compute.cc @@ -38,7 +38,7 @@ void PoolCompute::Run() { std::vector& ksize = param.ksize; std::vector& strides = param.strides; - std::vector& paddings = param.paddings; + std::vector& paddings = *param.paddings; std::string& pooling_type = param.pooling_type; bool global_pooling = param.global_pooling; @@ -48,12 +48,15 @@ void PoolCompute::Run() { bool use_quantizer = param.use_quantizer; std::string& data_format = param.data_format; - bool kps_equal = (ksize[0] == ksize[1]) && (strides[0] == strides[1]) && - (paddings[0] == paddings[1]); + bool pads_equal = + (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]); + bool kps_equal = (ksize[0] == ksize[1]) && (strides[0] == strides[1]) && + (paddings[0] == paddings[2]); if (global_pooling) { for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; + paddings[2 * i] = 0; + paddings[2 * i + 1] = 0; ksize[i] = static_cast(in_dims[i + 2]); } if (pooling_type == "max") { @@ -80,7 +83,8 @@ void PoolCompute::Run() { return; } } else { - if (ksize[0] == 2 && strides[0] == 2 && 
paddings[0] == 0 && kps_equal) { + if (ksize[0] == 2 && strides[0] == 2 && paddings[0] == 0 && pads_equal && + kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling2x2s2_max(din, dout, @@ -106,7 +110,7 @@ void PoolCompute::Run() { return; } } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 1 && - kps_equal) { + pads_equal && kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling3x3s1p1_max(din, dout, @@ -132,7 +136,7 @@ void PoolCompute::Run() { return; } } else if (ksize[0] == 3 && strides[0] == 1 && paddings[0] == 0 && - kps_equal) { + pads_equal && kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling3x3s1p0_max(din, dout, @@ -158,7 +162,7 @@ void PoolCompute::Run() { return; } } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 0 && - kps_equal) { + pads_equal && kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling3x3s2p0_max(din, dout, @@ -184,7 +188,7 @@ void PoolCompute::Run() { return; } } else if (ksize[0] == 3 && strides[0] == 2 && paddings[0] == 1 && - kps_equal) { + pads_equal && kps_equal) { if (pooling_type == "max") { lite::arm::math::pooling3x3s2p1_max(din, dout, diff --git a/lite/kernels/arm/pool_compute_test.cc b/lite/kernels/arm/pool_compute_test.cc index 79e5332172..7ed8a142dd 100644 --- a/lite/kernels/arm/pool_compute_test.cc +++ b/lite/kernels/arm/pool_compute_test.cc @@ -15,6 +15,7 @@ #include "lite/kernels/arm/pool_compute.h" #include #include +#include #include #include #include "lite/backends/arm/math/funcs.h" @@ -25,14 +26,21 @@ namespace lite { namespace kernels { namespace arm { -int PoolOutputSize( - int input_size, int filter_size, int padding, int stride, bool ceil_mode) { +int PoolOutputSize(int input_size, + int filter_size, + int pad_left, + int pad_right, + int stride, + bool ceil_mode) { int output_size; if (!ceil_mode) { - output_size = (input_size - filter_size + 2 * padding) / stride + 1; + output_size = + (input_size - filter_size + pad_left + pad_right) / stride + 1; } else { output_size = - (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; + (input_size - filter_size + pad_left + pad_right + stride - 1) / + stride + + 1; } return output_size; } @@ -40,10 +48,12 @@ int PoolOutputSize( std::vector compute_output_shape(operators::PoolParam* param_) { const auto x_dims = param_->x->dims(); std::vector& ksize = param_->ksize; + auto paddings = *param_->paddings; if (param_->global_pooling) { ksize.resize(static_cast(x_dims.size()) - 2); for (size_t i = 0; i < ksize.size(); ++i) { - param_->paddings[i] = 0; + paddings[2 * i] = 0; + paddings[2 * i + 1] = 0; ksize[i] = static_cast(x_dims[i + 2]); } } @@ -56,7 +66,8 @@ std::vector compute_output_shape(operators::PoolParam* param_) { for (size_t i = 0; i < param_->ksize.size(); ++i) { output_shape.push_back(PoolOutputSize(x_dims[i + 2], param_->ksize[i], - param_->paddings[i], + paddings[2 * i], + paddings[2 * i + 1], param_->strides[i], param_->ceil_mode)); } @@ -73,7 +84,7 @@ void pool_compute_ref(const operators::PoolParam& param) { std::vector ksize = param.ksize; std::vector strides = param.strides; - std::vector paddings = param.paddings; + std::vector paddings = *param.paddings; std::string pooling_type = param.pooling_type; bool global_pooling = param.global_pooling; @@ -99,7 +110,7 @@ void pool_compute_ref(const operators::PoolParam& param) { int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; int size_channel_in = win * hin; 
int size_channel_out = wout * hout; if (global_pooling) { @@ -178,18 +189,22 @@ void pool_compute_ref(const operators::PoolParam& param) { int bh = kernel_h; int bw = kernel_w; if (ew == win) { - bw = sw + kernel_w >= win + pad_w ? win + pad_w - : sw + kernel_w; + bw = (sw + kernel_w) >= (win + paddings[3]) + ? (win + paddings[3]) + : (sw + kernel_w); bw -= sw; - if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) { + if ((sw - pad_w) < 0 && + (sw + kernel_w) > (win + paddings[3])) { bw += pad_w; } } if (eh == hin) { - bh = sh + kernel_h >= hin + pad_h ? hin + pad_h - : sh + kernel_h; + bh = (sh + kernel_h) >= (hin + paddings[1]) + ? (hin + paddings[1]) + : (sh + kernel_h); bh -= sh; - if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) { + if ((sh - pad_h) < 0 && + (sh + kernel_h) > (hin + paddings[1])) { bh += pad_h; } } @@ -225,75 +240,92 @@ TEST(pool_arm, compute) { for (auto exclusive : {true, false}) { for (auto ksize : {2, 3}) { for (auto stride : {1, 2}) { - for (auto pad : {0, 1}) { - for (auto n : {1, 2}) { - for (auto c : {1, 3}) { + for (auto pad_left : {0, 1}) { + for (auto pad_right : {0, 1}) { + for (auto pad_top : {0, 1}) { + for (auto pad_bottom : {0, 1}) { + for (auto n : {1, 2}) { + for (auto c : {1, 3}) { #if 1 - for (auto h : {2, 3, 4, 11}) { - for (auto w : {2, 3, 4, 11}) { + for (auto h : {2, 3, 4, 11}) { + for (auto w : {2, 3, 4, 11}) { #else - for (int h = 2; h < 25; h++) { - for (int w = 2; w < 25; w++) { + for (int h = 2; h < 25; h++) { + for (int w = 2; w < 25; w++) { #endif - VLOG(3) << "n:" << n << " c:" << c << " h:" << h - << " w:" << w << " ksize:" << ksize - << " stride:" << stride << " pad:" << pad - << " exclusive:" << exclusive - << " global_pooling:" << global_pooling - << " ceil_mode: " << ceil_mode - << " pooling_type:" << pooling_type; + VLOG(3) << "n:" << n << " c:" << c << " h:" << h + << " w:" << w << " ksize:" << ksize + << " stride:" << stride + << " pad_left:" << pad_left + << " pad_right:" << pad_right + << " pad_top:" << pad_top + << " pad_bottom:" << pad_bottom + << " exclusive:" << exclusive + << " global_pooling:" << global_pooling + << " ceil_mode: " << ceil_mode + << " pooling_type:" << pooling_type; - // init x, output - x.Resize(DDim(std::vector({n, c, h, w}))); - auto* x_data = x.mutable_data(); - for (int i = 0; i < x.dims().production(); ++i) { - float sign = i % 3 == 0 ? -0.03 : 0.05f; - x_data[i] = sign * (i % 128); - } + // init x, output + x.Resize( + DDim(std::vector({n, c, h, w}))); + auto* x_data = x.mutable_data(); + for (int i = 0; i < x.dims().production(); ++i) { + float sign = i % 3 == 0 ? 
-0.03 : 0.05f; + x_data[i] = sign * (i % 128); + } - // fill param - param.x = &x; - param.output = &output; - param.pooling_type = pooling_type; - if (global_pooling) { - param.ksize = {h, w}; - } else { - param.ksize = {ksize, ksize}; - } - param.global_pooling = global_pooling; - param.strides = {stride, stride}; - param.paddings = {pad, pad}; - param.exclusive = exclusive; - param.ceil_mode = ceil_mode; - param.adaptive = false; - param.use_quantizer = false; + // fill param + param.x = &x; + param.output = &output; + param.pooling_type = pooling_type; + if (global_pooling) { + param.ksize = {h, w}; + } else { + param.ksize = {ksize, ksize}; + } + param.global_pooling = global_pooling; + param.strides = {stride, stride}; + std::vector paddings = { + pad_top, pad_bottom, pad_left, pad_right}; + param.exclusive = exclusive; + param.paddings = + std::make_shared>(paddings); + param.ceil_mode = ceil_mode; + param.adaptive = false; + param.use_quantizer = false; - const std::vector& output_shape = - compute_output_shape(¶m); - output.Resize(DDim(output_shape)); - output_ref.Resize(DDim(output_shape)); + const std::vector& output_shape = + compute_output_shape(¶m); + output.Resize(DDim(output_shape)); + output_ref.Resize(DDim(output_shape)); - auto* output_data = output.mutable_data(); - auto* output_ref_data = - output_ref.mutable_data(); - for (int i = 0; i < output.dims().production(); ++i) { - output_data[i] = -2; - output_ref_data[i] = -2; - } + auto* output_data = output.mutable_data(); + auto* output_ref_data = + output_ref.mutable_data(); + for (int i = 0; i < output.dims().production(); + ++i) { + output_data[i] = -2; + output_ref_data[i] = -2; + } - // compute - pool.SetParam(param); - pool.Run(); + // compute + pool.SetParam(param); + pool.Run(); - // compute ref - param.output = &output_ref; - pool_compute_ref(param); + // compute ref + param.output = &output_ref; + pool_compute_ref(param); - // compare - for (int i = 0; i < output.dims().production(); i++) { - EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-4); + // compare + for (int i = 0; i < output.dims().production(); + i++) { + EXPECT_NEAR( + output_data[i], output_ref_data[i], 1e-4); + } + VLOG(3) << "compare pass"; + } + } } - VLOG(3) << "compare pass"; } } } diff --git a/lite/kernels/arm/split_compute.cc b/lite/kernels/arm/split_compute.cc index 27606e2d76..2a0c52e7fc 100644 --- a/lite/kernels/arm/split_compute.cc +++ b/lite/kernels/arm/split_compute.cc @@ -42,5 +42,9 @@ void SplitCompute::Run() { REGISTER_LITE_KERNEL( split, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::SplitCompute, def) .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindInput("AxisTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("SectionsTensorList", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); diff --git a/lite/kernels/cuda/CMakeLists.txt b/lite/kernels/cuda/CMakeLists.txt index b33fc8f6bb..4bf1cbf521 100644 --- a/lite/kernels/cuda/CMakeLists.txt +++ b/lite/kernels/cuda/CMakeLists.txt @@ -5,24 +5,39 @@ endif() message(STATUS "compile with lite CUDA kernels") add_kernel(mul_compute_cuda CUDA basic SRCS mul_compute.cc DEPS ${lite_kernel_deps} context) +add_kernel(search_group_padding_compute_cuda CUDA basic SRCS search_group_padding_compute.cu DEPS ${lite_kernel_deps}) add_kernel(io_copy_compute_cuda CUDA basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps}) add_kernel(leaky_relu_compute_cuda CUDA basic SRCS 
leaky_relu_compute.cu DEPS ${lite_kernel_deps}) add_kernel(relu_compute_cuda CUDA basic SRCS relu_compute.cu DEPS ${lite_kernel_deps}) add_kernel(yolo_box_compute_cuda CUDA basic SRCS yolo_box_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sequence_pool_compute_cuda CUDA extra SRCS sequence_pool_compute.cu DEPS ${lite_kernel_deps}) add_kernel(transpose_compute_cuda CUDA basic SRCS transpose_compute.cu DEPS ${lite_kernel_deps} ${math_cuda} cuda_transpose) add_kernel(nearest_interp_compute_cuda CUDA basic SRCS nearest_interp_compute.cu DEPS ${lite_kernel_deps}) add_kernel(conv2d_cuda CUDA basic SRCS conv_compute.cc DEPS ${lite_kernel_deps} ${math_cuda}) add_kernel(concat_compute_cuda CUDA basic SRCS concat_compute.cu DEPS ${lite_kernel_deps}) -add_kernel(elementwise_add_compute_cuda CUDA basic SRCS elementwise_add_compute.cu DEPS ${lite_kernel_deps} cuda_elementwise) +add_kernel(elementwise_compute_cuda CUDA basic SRCS elementwise_compute.cu DEPS ${lite_kernel_deps} cuda_elementwise) add_kernel(calib_compute_cuda CUDA basic SRCS calib_compute.cu DEPS ${lite_kernel_deps}) add_kernel(layout_compute_cuda CUDA basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} cuda_transpose) add_kernel(feed_compute_cuda CUDA basic SRCS feed_compute.cc DEPS ${lite_kernel_deps}) add_kernel(scale_compute_cuda CUDA basic SRCS scale_compute.cc DEPS ${lite_kernel_deps} cuda_scale) add_kernel(dropout_compute_cuda CUDA basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} cuda_scale) add_kernel(softmax_compute_cuda CUDA basic SRCS softmax_compute.cu DEPS ${lite_kernel_deps}) -add_kernel(pool_compute_cuda CUDA basic SRCS pool_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(pool_compute_cuda CUDA basic SRCS pool_compute.cu DEPS +${lite_kernel_deps} cudnn_pool) add_kernel(bilinear_interp_compute_cuda CUDA basic SRCS bilinear_interp_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(search_seq_depadding_compute_cuda CUDA extra SRCS search_seq_depadding_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(search_grnn_compute_cuda CUDA extra SRCS search_grnn_compute.cu DEPS ${lite_kernel_deps} cuda_gemm) +add_kernel(sequence_reverse_compute_cuda CUDA basic SRCS sequence_reverse_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sequence_concat_compute_cuda CUDA basic SRCS sequence_concat_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(sequence_arithmetic_compute_cuda CUDA basic SRCS sequence_arithmetic_compute.cu DEPS ${lite_kernel_deps}) add_kernel(lookup_table_compute_cuda CUDA extra SRCS lookup_table_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(attention_padding_mask_compute_cuda CUDA extra SRCS attention_padding_mask_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(search_fc_compute_cuda CUDA basic SRCS search_fc_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) +add_kernel(sequence_topk_avg_pooling_compute_cuda CUDA basic SRCS sequence_topk_avg_pooling_compute.cu DEPS ${lite_kernel_deps}) +add_kernel(match_matrix_tensor_compute_cuda CUDA extra SRCS match_matrix_tensor_compute.cu DEPS ${lite_kernel_deps} cuda_gemm) +add_kernel(search_aligned_mat_mul_compute_cuda CUDA extra SRCS search_aligned_mat_mul_compute.cc DEPS ${lite_kernel_deps} cuda_batched_gemm) +add_kernel(search_seq_fc_compute_cuda CUDA extra SRCS search_seq_fc_compute.cu DEPS ${lite_kernel_deps} cuda_gemm) +add_kernel(var_conv_2d_compute_cuda CUDA basic SRCS var_conv_2d_compute.cu DEPS ${lite_kernel_deps} ${math_cuda}) lite_cc_test(calib_compute_cuda_test SRCS calib_compute_cuda_test.cc DEPS calib_compute_cuda) nv_test(conv2d_cuda_test SRCS 
conv_compute_test.cc DEPS conv2d_cuda) @@ -31,13 +46,28 @@ nv_test(leaky_relu_compute_cuda_test SRCS leaky_relu_compute_test.cc DEPS leaky_ nv_test(relu_compute_cuda_test SRCS relu_compute_test.cc DEPS relu_compute_cuda) nv_test(yolo_box_compute_cuda_test SRCS yolo_box_compute_test.cc DEPS yolo_box_compute_cuda) nv_test(transpose_compute_cuda_test SRCS transpose_compute_test.cc DEPS transpose_compute_cuda) +nv_test(search_group_padding_compute_cuda_test SRCS search_group_padding_compute_test.cc DEPS search_group_padding_compute_cuda) nv_test(concat_compute_cuda_test SRCS concat_compute_test.cc DEPS concat_compute_cuda) -nv_test(elementwise_add_compute_cuda_test SRCS elementwise_add_compute_test.cc DEPS elementwise_add_compute_cuda) +nv_test(elementwise_compute_cuda_test SRCS elementwise_compute_test.cc DEPS elementwise_compute_cuda) nv_test(softmax_compute_cuda_test SRCS softmax_compute_test.cc DEPS softmax_compute_cuda) #nv_test(layout_cuda_test SRCS layout_compute_test.cc DEPS layout_compute_cuda) -nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda) +nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda) nv_test(dropout_compute_cuda_test SRCS dropout_compute_test.cc DEPS dropout_compute_cuda ) nv_test(bilinear_interp_compute_cuda_test SRCS bilinear_interp_compute_test.cc DEPS bilinear_interp_compute_cuda) +nv_test(pool_compute_cuda_test SRCS pool_compute_test.cc DEPS pool_compute_cuda) +nv_test(sequence_reverse_compute_cuda_test SRCS sequence_reverse_compute_test.cc DEPS sequence_reverse_compute_cuda) +nv_test(sequence_concat_compute_cuda_test SRCS sequence_concat_compute_test.cc DEPS sequence_concat_compute_cuda) +nv_test(attention_padding_mask_compute_cuda_test SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_cuda) +nv_test(sequence_arithmetic_compute_cuda_test SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_cuda) +nv_test(search_fc_test SRCS search_fc_compute_test.cc DEPS search_fc_compute_cuda sequence_topk_avg_pooling_compute_cuda) +nv_test(var_conv_2d_compute_cuda_test SRCS var_conv_2d_compute_test.cc DEPS var_conv_2d_compute_cuda) + if(LITE_BUILD_EXTRA) + nv_test(search_seq_depadding_compute_cuda_test SRCS search_seq_depadding_compute_test.cc DEPS search_seq_depadding_compute_cuda) + nv_test(match_matrix_tensor_compute_cuda_test SRCS match_matrix_tensor_compute_test.cc DEPS match_matrix_tensor_compute_cuda) + nv_test(search_grnn_compute_cuda_test SRCS search_grnn_compute_test.cc DEPS search_grnn_compute_cuda) + nv_test(sequence_pool_compute_cuda_test SRCS sequence_pool_compute_test.cc DEPS sequence_pool_compute_cuda) nv_test(lookup_table_compute_cuda_test SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_cuda) + nv_test(search_aligned_mat_mul_compute_cuda_test SRCS search_aligned_mat_mul_compute_test.cc DEPS search_aligned_mat_mul_compute_cuda) + nv_test(search_seq_fc_compute_cuda_test SRCS search_seq_fc_compute_test.cc DEPS search_seq_fc_compute_cuda) endif() diff --git a/lite/kernels/cuda/attention_padding_mask_compute.cu b/lite/kernels/cuda/attention_padding_mask_compute.cu new file mode 100644 index 0000000000..fac73b1adc --- /dev/null +++ b/lite/kernels/cuda/attention_padding_mask_compute.cu @@ -0,0 +1,162 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/attention_padding_mask_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +#define CUDA_NUM_THREADS 256 + +inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void ker_attention_padding_mask(T* out_data, + const T* attn_data, + const int* src_offset, + const int attn_seq_num, + const int attn_seq_len, + const int src_seq_num, + const int src_seq_len, + const T* pad_begin_data, + const T mask, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int src_word_id = tid % src_seq_len; + int tmp_tid = tid / src_seq_len; + int attn_seq_id = tmp_tid / attn_seq_len; + int attn_word_id = tmp_tid % attn_seq_len; + int src_seq_id = attn_seq_id % src_seq_num; + int cur_len = src_offset[src_seq_id + 1] - src_offset[src_seq_id]; + + int k = static_cast(pad_begin_data[src_seq_id]); + if (k < cur_len && + tid >= src_seq_len * (attn_seq_len * attn_seq_id + attn_word_id) + k && + tid < src_seq_len * (attn_seq_len * attn_seq_id + attn_word_id) + + cur_len) { + out_data[tid] = mask; + } else { + out_data[tid] = attn_data[tid]; + } + } +} + +void AttentionPaddingMaskCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + auto attn = param.X; + auto src = param.Y; + const int count = attn->numel(); + auto attn_offset = attn->lod()[0]; + auto src_offset = src->lod()[0]; + const int attn_seq_num = attn_offset.size() - 1; + const int attn_seq_len = attn_offset[1]; + const int src_seq_num = src_offset.size() - 1; + const int src_seq_len = count / attn->dims()[0]; + + auto out = param.Out; + out->Resize(attn->dims()); + out->set_lod(attn->lod()); + + auto attn_data = attn->data(); + auto out_data = out->mutable_data(TARGET(kCUDA)); + + std::vector src_cpu(src->numel(), 0); + TargetWrapperCuda::MemcpyAsync(src_cpu.data(), + src->data(), + sizeof(float) * src->numel(), + IoDirection::DtoH, + stream); + cudaStreamSynchronize(stream); + + std::vector pad_begin(src_seq_num, 0); + auto src_len = static_cast(src->lod()[0][1]); + int _pad_id = param.pad_id; + for (int i = 0; i < src_seq_num; ++i) { + const auto* src_data = src_cpu.data() + src_len * i; + int index = src_len - 1; + for (; index >= 0 && _pad_id == static_cast(src_data[index]); + --index) { + } + pad_begin[i] = static_cast(index + 1); + } + + param.pad_begin->Resize({static_cast(src_seq_num)}); + auto pad_begin_cuda_data = + param.pad_begin->mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemcpyAsync(pad_begin_cuda_data, + pad_begin.data(), + sizeof(float) * src_seq_num, + IoDirection::HtoD, + stream); + + std::vector src_offset_cpu(src_offset.size(), 0); + for (int i = 0; i < src_offset.size(); i++) { + src_offset_cpu[i] = src_offset[i]; + } + + src_offset_cuda.Resize({static_cast(src_offset.size())}); + auto 
src_offset_cuda_data = src_offset_cuda.mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemcpyAsync(src_offset_cuda_data, + src_offset_cpu.data(), + sizeof(int) * src_offset.size(), + IoDirection::HtoD, + stream); + + ker_attention_padding_mask< + float><<>>( + out_data, + attn_data, + src_offset_cuda_data, + attn_seq_num, + attn_seq_len, + src_seq_num, + src_seq_len, + pad_begin_cuda_data, + param.mask, + count); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_attention_padding_mask, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::AttentionPaddingMaskCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("pad_begin", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/attention_padding_mask_compute.h b/lite/kernels/cuda/attention_padding_mask_compute.h new file mode 100644 index 0000000000..57d8c269a1 --- /dev/null +++ b/lite/kernels/cuda/attention_padding_mask_compute.h @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class AttentionPaddingMaskCompute + : public KernelLite { + public: + using param_t = operators::AttentionPaddingMaskParam; + + void Run() override; + virtual ~AttentionPaddingMaskCompute() = default; + + private: + lite::Tensor src_offset_cuda; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/attention_padding_mask_compute_test.cc b/lite/kernels/cuda/attention_padding_mask_compute_test.cc new file mode 100644 index 0000000000..d11858350d --- /dev/null +++ b/lite/kernels/cuda/attention_padding_mask_compute_test.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
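For context on the launch configuration used by the kernel above: CUDA_NUM_THREADS is fixed at 256, CUDA_GET_BLOCKS(count) rounds the block count up so every element is covered, and CUDA_KERNEL_LOOP strides by blockDim.x * gridDim.x so the kernel remains correct even if fewer blocks are launched than elements exist. A small host-side sketch of the block-count arithmetic (illustrative only, mirroring the macros above):

#include <cassert>

constexpr int kCudaNumThreads = 256;  // mirrors CUDA_NUM_THREADS

inline int GetBlocks(int n) { return (n + kCudaNumThreads - 1) / kCudaNumThreads; }

int main() {
  assert(GetBlocks(1) == 1);     // a single element still needs one block
  assert(GetBlocks(256) == 1);   // exactly one full block
  assert(GetBlocks(257) == 2);   // one spilled element requires a second block
  assert(GetBlocks(1000) == 4);  // 1000 elements -> 4 blocks of 256 threads
  return 0;
}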
+ +#include "lite/kernels/cuda/attention_padding_mask_compute.h" +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +void attention_padding_mask_ref( + const Tensor& x, + const Tensor& y, + Tensor* out, + Tensor* pad_begin, + const operators::AttentionPaddingMaskParam& param) { + auto attn_offset = x.lod()[0]; + auto src_offset = y.lod()[0]; + int attn_seq_num = attn_offset.size() - 1; + int src_seq_num = src_offset.size() - 1; + int attn_seq_len = attn_offset[1]; + int src_seq_len = x.dims()[1]; + CHECK_EQ(attn_seq_num % src_seq_num, 0); + + auto count = x.numel(); + auto attn_data = x.data(); + out->Resize(x.dims()); + out->set_lod(x.lod()); + auto out_data = out->mutable_data(); + memcpy(out_data, attn_data, count * sizeof(float)); + + for (int i = 0; i < attn_seq_num; ++i) { + for (int j = 0; j < attn_seq_len; ++j) { + auto tmp_out_data = out_data + src_seq_len * (attn_seq_len * i + j); + int src_seq_idx = i % src_seq_num; + int cur_len = src_offset[src_seq_idx + 1] - src_offset[src_seq_idx]; + for (int k = cur_len; k < src_seq_len; k++) { + tmp_out_data[k] = param.mask; + } + } + } +} + +void prepare_input(Tensor* x, const LoD& lod, int64_t dim2rd) { + std::vector x_dims{static_cast(lod[0].back()), dim2rd}; + x->Resize(x_dims); + x->set_lod(lod); + auto x_data = x->mutable_data(); + auto x_num = x->numel(); + for (int i = 0; i < x_num; i++) { + x_data[i] = (i - x_num) * 1.1; + } +} + +int get_max_len(const LoD& lod) { + int max_len = 0; + auto offset = lod[0]; + for (int i = 0; i < offset.size() - 1; i++) { + int cur_len = offset[i + 1] - offset[i]; + max_len = max_len < cur_len ? cur_len : max_len; + } + return max_len; +} + +TEST(attention_padding_mask_cuda, run_test) { + lite::Tensor x, y, x_cpu, y_cpu; + lite::Tensor out, pad_begin, out_cpu, out_ref, pad_begin_ref; + + LoD x_lod{{0, 3, 6, 9, 12}}, y_lod{{0, 4, 6}}; + prepare_input(&x_cpu, x_lod, get_max_len(y_lod)); + prepare_input(&y_cpu, y_lod, 1); + + x.Resize(x_cpu.dims()); + x.set_lod(x_cpu.lod()); + auto x_cpu_data = x_cpu.mutable_data(); + x.Assign(x_cpu_data, x_cpu.dims()); + + y.Resize(y_cpu.dims()); + y.set_lod(y_cpu.lod()); + + operators::AttentionPaddingMaskParam param; + param.X = &x; + param.Y = &y; + param.pad_id = 12800001; + param.mask = -90000000.f; + param.Out = &out; + param.pad_begin = &pad_begin; + + std::unique_ptr ctx(new KernelContext); + auto context = ctx->As(); + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + AttentionPaddingMaskCompute attention_padding_mask_kernel; + attention_padding_mask_kernel.SetParam(param); + attention_padding_mask_kernel.SetContext(std::move(ctx)); + attention_padding_mask_kernel.Run(); + cudaDeviceSynchronize(); + + auto out_data = out.mutable_data(TARGET(kCUDA)); + out_cpu.Resize(out.dims()); + auto out_cpu_data = out_cpu.mutable_data(); + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + + attention_padding_mask_ref(x_cpu, y_cpu, &out_ref, &pad_begin_ref, param); + auto out_ref_data = out_ref.data(); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/bilinear_interp_compute.cu b/lite/kernels/cuda/bilinear_interp_compute.cu index 7e1dbaf228..00b1457938 100644 --- a/lite/kernels/cuda/bilinear_interp_compute.cu +++ 
b/lite/kernels/cuda/bilinear_interp_compute.cu @@ -11,6 +11,7 @@ limitations under the License. */ #pragma once #include +#include "lite/backends/cuda/target_wrapper.h" #include "lite/core/op_registry.h" #include "lite/kernels/cuda/bilinear_interp_compute.h" @@ -20,6 +21,43 @@ namespace kernels { namespace cuda { using Tensor = lite::Tensor; +inline std::vector get_new_shape( + std::vector list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + lite::Tensor temp; + auto temp_data = temp.mutable_data(); + auto tensor_data = tensor->data(); + cudaMemcpy(temp_data, + tensor_data, + tensor->dims().production() * sizeof(float), + cudaMemcpyDeviceToHost); + + vec_new_shape.push_back(static_cast(*temp_data)); + } + + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + lite::Tensor cpu_starts_tensor; + auto cpu_starts_tensor_data = cpu_starts_tensor.mutable_data(); + cudaMemcpy(cpu_starts_tensor_data, + new_data, + new_data_tensor->dims().production() * sizeof(T), + cudaMemcpyDeviceToHost); + + auto new_data_ = cpu_starts_tensor.data(); + vec_new_data = std::vector( + new_data_, new_data_ + new_data_tensor->dims().production()); + return vec_new_data; +} + template __global__ void BilinearInterp(const T* in, const size_t in_img_h, @@ -103,23 +141,35 @@ void BilinearInterpCompute::Run() { int out_w = param.out_w; float scale = param.scale; bool align_corners = param.align_corners; - if (scale > 0) { - out_h = static_cast(in_h * scale); - out_w = static_cast(in_w * scale); - } - if (out_size != nullptr) { - Tensor sizes; - float* size_data = sizes.mutable_data(); - float* outsize_data = out_size->mutable_data(TARGET(kCUDA)); - cudaMemcpy( - size_data, outsize_data, sizeof(float) * 2, cudaMemcpyDeviceToHost); - out_h = static_cast(size_data[0]); - out_w = static_cast(size_data[1]); + auto list_new_shape_tensor = param.SizeTensor; + if (list_new_shape_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_shape_tensor); + out_h = new_size[0]; + out_w = new_size[1]; + } else { + auto scale_tensor = param.Scale; + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } + if (scale > 0) { + out_h = static_cast(in_h * scale); + out_w = static_cast(in_w * scale); + } + if (out_size != nullptr) { + lite::Tensor sizes; + float* size_data = sizes.mutable_data(); + float* outsize_data = out_size->mutable_data(TARGET(kCUDA)); + cudaMemcpy( + size_data, outsize_data, sizeof(float) * 2, cudaMemcpyDeviceToHost); + out_h = static_cast(size_data[0]); + out_w = static_cast(size_data[1]); + } } auto output_data = output->mutable_data(TARGET(kCUDA)); - if (in_h == out_h && in_w == out_w) { cudaMemcpy(output_data, input_data, @@ -188,6 +238,14 @@ REGISTER_LITE_KERNEL(bilinear_interp, {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW))}) + .BindInput("SizeTensor", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Scale", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat), diff --git a/lite/kernels/cuda/bilinear_interp_compute_test.cc b/lite/kernels/cuda/bilinear_interp_compute_test.cc index 
e7e8143150..e93f5b1f3e 100644 --- a/lite/kernels/cuda/bilinear_interp_compute_test.cc +++ b/lite/kernels/cuda/bilinear_interp_compute_test.cc @@ -16,6 +16,7 @@ #include #include #include +#include namespace paddle { namespace lite { @@ -98,6 +99,116 @@ TEST(bilinear_interp, normal) { } } +TEST(bilinear_interp, update) { + BilinearInterpCompute bilinear_interp_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::InterpolateParam param; + + std::vector size_tensor(2); + std::vector size_tensor_cpu(2), size_tensor_ref(2); + Tensor x, input_scale, osz, out; + Tensor x_cpu, input_scale_cpu, osz_cpu, out_cpu; + Tensor x_ref, input_scale_ref, osz_ref, out_ref; + + int n = 1, c = 1, in_h = 3, in_w = 3; + int out_h = 6, out_w = 6; + float scale = 2.0; + + param.out_h = out_h; + param.out_w = out_w; + param.scale = scale; + param.align_corners = false; + param.align_mode = 0; + + x.Resize({n, c, in_h, in_w}); + size_tensor[0].Resize({1}); + size_tensor[1].Resize({1}); + input_scale.Resize({1}); + osz.Resize({2}); + out.Resize({n, c, out_h, out_w}); + + x_cpu.Resize({n, c, in_h, in_w}); + size_tensor_cpu[0].Resize({1}); + size_tensor_cpu[1].Resize({1}); + input_scale_cpu.Resize({1}); + osz_cpu.Resize({2}); + out_cpu.Resize({n, c, out_h, out_w}); + + x_ref.Resize({n, c, in_h, in_w}); + size_tensor_ref[0].Resize({1}); + size_tensor_ref[1].Resize({1}); + input_scale_ref.Resize({1}); + osz_ref.Resize({2}); + out_ref.Resize({n, c, out_h, out_w}); + + auto* out_data = out.mutable_data(TARGET(kCUDA)); + + float* x_cpu_data = x_cpu.mutable_data(); + float* size_tensor0_cpu_data = size_tensor_cpu[0].mutable_data(); + float* size_tensor1_cpu_data = size_tensor_cpu[1].mutable_data(); + float* input_scale_cpu_data = input_scale_cpu.mutable_data(); + float* osz_cpu_data = osz_cpu.mutable_data(); + float* out_cpu_data = out_cpu.mutable_data(); + + float* x_ref_data = x_ref.mutable_data(); + float* size_tensor0_ref_data = size_tensor_ref[0].mutable_data(); + float* size_tensor1_ref_data = size_tensor_ref[1].mutable_data(); + float* input_scale_ref_data = input_scale_ref.mutable_data(); + float* osz_ref_data = osz_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = i + 5.0; + x_ref_data[i] = i + 5.0; + } + + osz_cpu_data[0] = out_h; + osz_cpu_data[1] = out_w; + size_tensor0_cpu_data[0] = out_h; + size_tensor1_cpu_data[0] = out_w; + input_scale_cpu_data[0] = scale; + osz_ref_data[0] = out_h; + osz_ref_data[1] = out_w; + size_tensor0_ref_data[0] = out_h; + size_tensor1_ref_data[0] = out_w; + input_scale_ref_data[0] = scale; + + x.Assign(x_cpu_data, x_cpu.dims()); + size_tensor[0].Assign( + size_tensor0_cpu_data, size_tensor[0].dims()); + size_tensor[1].Assign( + size_tensor1_cpu_data, size_tensor[1].dims()); + input_scale.Assign(input_scale_cpu_data, + input_scale.dims()); + osz.Assign(osz_cpu_data, osz_cpu.dims()); + + param.X = &x; + param.SizeTensor.emplace_back( + reinterpret_cast(&size_tensor[0])); + param.SizeTensor.emplace_back( + reinterpret_cast(&size_tensor[1])); + param.Scale = &input_scale; + param.OutSize = &osz; + param.Out = &out; + + bilinear_interp_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + bilinear_interp_kernel.SetContext(std::move(ctx)); + bilinear_interp_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + for (int i = 0; i < out.numel(); i++) { + LOG(INFO) << 
out_cpu_data[i]; + } +} + } // namespace cuda } // namespace kernels } // namespace lite diff --git a/lite/kernels/cuda/calib_compute_cuda_test.cc b/lite/kernels/cuda/calib_compute_cuda_test.cc index 8703d8730a..fdb47f7dd3 100644 --- a/lite/kernels/cuda/calib_compute_cuda_test.cc +++ b/lite/kernels/cuda/calib_compute_cuda_test.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "lite/kernels/cuda/calib_compute.h" #include #include #include @@ -58,12 +59,7 @@ void calib_ref(const operators::CalibParam& param, bool to_float = true) { } TEST(calib_cuda, int8_to_fp32) { - LOG(INFO) << "to get kernel ..."; - auto kernels = KernelRegistry::Global().Create( - "calib", TARGET(kCUDA), PRECISION(kInt8), DATALAYOUT(kNCHW)); - ASSERT_FALSE(kernels.empty()); - auto calib = std::move(*std::next(kernels.begin(), 1)); - LOG(INFO) << "get kernel: " << calib->doc(); + CalibComputeInt8ToFp32 calib; const int n = 64, c = 32, h = 18, w = 18; Tensor x; Tensor x_cpu; @@ -87,14 +83,14 @@ TEST(calib_cuda, int8_to_fp32) { cudaStream_t stream; cudaStreamCreate(&stream); context.SetExecStream(stream); - calib->SetContext(std::move(ctx)); + calib.SetContext(std::move(ctx)); operators::CalibParam param; param.scale = 0.013f; param.input = &x; param.output = &output; - calib->SetParam(param); - calib->Launch(); + calib.SetParam(param); + calib.Launch(); cudaDeviceSynchronize(); // invoking ref implementation and compare results param.input = &x_cpu; @@ -113,12 +109,7 @@ TEST(calib_cuda, int8_to_fp32) { } TEST(calib_cuda, fp32_to_int8) { - LOG(INFO) << "to get kernel ..."; - auto kernels = KernelRegistry::Global().Create( - "calib", TARGET(kCUDA), PRECISION(kInt8), DATALAYOUT(kNCHW)); - ASSERT_FALSE(kernels.empty()); - auto calib = std::move(kernels.front()); - LOG(INFO) << "get kernel: " << calib->doc(); + CalibComputeFp32ToInt8 calib; const int n = 64, c = 32, h = 18, w = 18; Tensor x; Tensor x_cpu; @@ -142,14 +133,14 @@ TEST(calib_cuda, fp32_to_int8) { cudaStream_t stream; cudaStreamCreate(&stream); context.SetExecStream(stream); - calib->SetContext(std::move(ctx)); + calib.SetContext(std::move(ctx)); operators::CalibParam param; param.scale = 0.013f; param.input = &x; param.output = &output; - calib->SetParam(param); - calib->Launch(); + calib.SetParam(param); + calib.Launch(); cudaDeviceSynchronize(); // invoking ref implementation and compare results param.input = &x_cpu; diff --git a/lite/kernels/cuda/concat_compute.cu b/lite/kernels/cuda/concat_compute.cu index 9ec6936672..72d0af459b 100644 --- a/lite/kernels/cuda/concat_compute.cu +++ b/lite/kernels/cuda/concat_compute.cu @@ -51,9 +51,9 @@ void ConcatCompute::Run() { Tensor* output = param.output; auto* output_data = output->mutable_data(TARGET(kCUDA)); int axis = param.axis; - auto* axis_tensor = param.axis_tensor; + Tensor* axis_tensor = param.axis_tensor; if (axis_tensor != nullptr) { - auto* axis_tensor_data = axis_tensor->data(); + const int* axis_tensor_data = axis_tensor->data(); axis = axis_tensor_data[0]; } int inner_size = 1; diff --git a/lite/kernels/cuda/conv_compute.cc b/lite/kernels/cuda/conv_compute.cc index eea81602dd..468ed0cbd0 100644 --- a/lite/kernels/cuda/conv_compute.cc +++ b/lite/kernels/cuda/conv_compute.cc @@ -21,10 +21,14 @@ namespace lite { namespace kernels { namespace cuda { -inline int ConvOutputSize( - int input_size, int filter_size, int dilation, int padding, int stride) { +inline int ConvOutputSize(int input_size, + int filter_size, + int 
dilation, + int pad_left, + int pad_right, + int stride) { const int dkernel = dilation * (filter_size - 1) + 1; - int output_size = (input_size + 2 * padding - dkernel) / stride + 1; + int output_size = (input_size + pad_left + pad_right - dkernel) / stride + 1; CHECK_GT_OR_FALSE(output_size, 0); return output_size; @@ -50,11 +54,15 @@ void ConvComputeInt8::PrepareForRun() { const auto filter_dims = param.filter->dims(); std::vector output_shape({in_dims[0]}); + auto paddings = *param.paddings; + auto dilations = *param.dilations; + for (size_t i = 0; i < param.strides.size(); ++i) { output_shape.push_back(ConvOutputSize(in_dims[i + 1], filter_dims[i + 1], - param.dilations[i], - param.paddings[i], + dilations[i], + paddings[2 * i], + paddings[2 * i + 1], param.strides[i])); } output_shape.push_back(filter_dims[0]); @@ -71,12 +79,15 @@ void ConvComputeInt8::Run() { const auto in_dims = param.x->dims(); const auto filter_dims = param.filter->dims(); std::vector output_shape({in_dims[0]}); + auto paddings = *param.paddings; + auto dilations = *param.dilations; for (size_t i = 0; i < param.strides.size(); ++i) { output_shape.push_back(ConvOutputSize(in_dims[i + 1], filter_dims[i + 1], - param.dilations[i], - param.paddings[i], + dilations[i], + paddings[2 * i], + paddings[2 * i + 1], param.strides[i])); } output_shape.push_back(filter_dims[0]); diff --git a/lite/kernels/cuda/conv_compute_test.cc b/lite/kernels/cuda/conv_compute_test.cc index 05175a0deb..2ebd7e33ba 100644 --- a/lite/kernels/cuda/conv_compute_test.cc +++ b/lite/kernels/cuda/conv_compute_test.cc @@ -41,7 +41,10 @@ TEST(conv_compute, fp32) { act_param.Leaky_relu_alpha = 0.1; operators::ConvParam param; param.activation_param = act_param; - param.paddings = {1, 1}; + std::vector pads = {1, 1, 1, 1}; + std::vector dilations = {1, 1, 1, 1}; + param.paddings = std::make_shared>(pads); + param.dilations = std::make_shared>(dilations); param.groups = 1; Tensor x, filter, bias, y, x_cpu, filter_cpu, bias_cpu, y_cpu; @@ -148,6 +151,10 @@ TEST(conv_compute, int8) { bias.Assign(bias_cpu_data, filter_cpu.dims()); + std::vector pads = {0, 0, 0, 0}; + std::vector dilations = {1, 1, 1, 1}; + param.paddings = std::make_shared>(pads); + param.dilations = std::make_shared>(dilations); param.x = &x; param.filter = &filter; param.output = &y; @@ -202,12 +209,10 @@ TEST(conv_compute, int8_int8_out) { std::cout << "input" << std::endl; for (int i = 0; i < x_cpu.numel(); i++) { x_cpu_data[i] = static_cast(random(-36, 36)); - std::cout << float(x_cpu_data[i]) << std::endl; } std::cout << "filter" << std::endl; for (int i = 0; i < filter_cpu.numel(); i++) { filter_cpu_data[i] = static_cast(random(-10, 10)); - std::cout << float(filter_cpu_data[i]) << std::endl; } for (int i = 0; i < bias_cpu.numel(); i++) { bias_cpu_data[i] = i + 1.0; @@ -220,6 +225,10 @@ TEST(conv_compute, int8_int8_out) { bias.Assign(bias_cpu_data, filter_cpu.dims()); + std::vector pads = {0, 0, 0, 0}; + std::vector dilations = {1, 1, 1, 1}; + param.paddings = std::make_shared>(pads); + param.dilations = std::make_shared>(dilations); param.x = &x; param.filter = &filter; param.output = &y; diff --git a/lite/kernels/cuda/elementwise_compute.cu b/lite/kernels/cuda/elementwise_compute.cu new file mode 100644 index 0000000000..64759f86f5 --- /dev/null +++ b/lite/kernels/cuda/elementwise_compute.cu @@ -0,0 +1,318 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "lite/backends/cuda/math/elementwise.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/elementwise_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +inline DDim trim_trailing_singular_dims(const DDim& dims) { + // Remove trailing dimensions of size 1 for y + auto actual_dims_size = dims.size(); + for (; actual_dims_size != 0; --actual_dims_size) { + if (dims[actual_dims_size - 1] != 1) break; + } + + std::vector trim_dims; + trim_dims.resize(actual_dims_size); + for (int i = 0; i < actual_dims_size; ++i) { + trim_dims[i] = dims[i]; + } + if (trim_dims.size() == 0) { + return DDim(); + } + return DDim(trim_dims); +} + +inline bool is_broadcast(const DDim& x_dims, + const DDim& y_dims, + int axis, + int* pre, + int* n, + int* post) { + if (axis < 0) { + axis = x_dims.size() - y_dims.size(); + } + DDim y_dim_trim = trim_trailing_singular_dims(y_dims); + axis = (y_dim_trim.size() == 0) ? x_dims.size() : axis; + if (x_dims.size() == y_dim_trim.size()) { + return false; + } + *pre = 1; + *n = 1; + *post = 1; + for (int i = 0; i < axis; ++i) { + (*pre) *= x_dims[i]; + } + for (int i = 0; i < y_dim_trim.size(); ++i) { + CHECK_EQ(x_dims[i + axis], y_dim_trim[i]) + << "Broadcast dimension mismatch."; + (*n) *= y_dim_trim[i]; + } + for (int i = axis + y_dim_trim.size(); i < x_dims.size(); ++i) { + (*post) *= x_dims[i]; + } + return true; +} + +#define ELEMENTWISE_COMPUTE(OP, WITH_RELU) \ + auto& param = this->Param(); \ + auto& ctx = this->ctx_->template As(); \ + auto stream = ctx.exec_stream(); \ + const lite::Tensor* x = param.X; \ + const lite::Tensor* y = param.Y; \ + lite::Tensor* out = param.Out; \ + int axis = param.axis; \ + auto* x_data = x->data(); \ + auto* y_data = y->data(); \ + auto out_data = out->mutable_data(TARGET(kCUDA)); \ + int pixel_num = x->numel(); \ + int pre = 1; \ + int n = pixel_num; \ + int post = 1; \ + if (WITH_RELU) { \ + if (is_broadcast(x->dims(), y->dims(), axis, &pre, &n, &post)) { \ + lite::cuda::math::elementwise_relu( \ + x_data, y_data, out_data, pre, n, post, OP, stream); \ + } else { \ + lite::cuda::math::elementwise_relu( \ + x_data, y_data, out_data, 1, pixel_num, 1, OP, stream); \ + } \ + } else { \ + if (is_broadcast(x->dims(), y->dims(), axis, &pre, &n, &post)) { \ + lite::cuda::math::elementwise( \ + x_data, y_data, out_data, pre, n, post, OP, stream); \ + } else { \ + lite::cuda::math::elementwise( \ + x_data, y_data, out_data, 1, pixel_num, 1, OP, stream); \ + } \ + } + +#define ELEMENTWISE_COMPUTE_NHWC(OP, WITH_RELU) \ + std::map pos_map = {{0, 0}, {1, 3}, {2, 1}, {3, 2}}; \ + auto& param = this->Param(); \ + auto& ctx = this->ctx_->template As(); \ + auto stream = ctx.exec_stream(); \ + const lite::Tensor* x = param.X; \ + const lite::Tensor* y = param.Y; \ + lite::Tensor* out = param.Out; \ + int axis = param.axis; \ + if (axis < 0) axis = x->dims().size() - y->dims().size(); \ + CHECK(axis >= 0) << "invalid axis of 
elementwise op"; \ + axis = pos_map[axis]; \ + auto* x_data = x->data(); \ + auto* y_data = y->data(); \ + auto out_data = out->mutable_data(TARGET(kCUDA)); \ + int pixel_num = x->numel(); \ + int pre = 1; \ + int n = pixel_num; \ + int post = 1; \ + if (WITH_RELU) { \ + if (is_broadcast(x->dims(), y->dims(), axis, &pre, &n, &post)) { \ + lite::cuda::math::elementwise_relu( \ + x_data, y_data, out_data, pre, n, post, OP, stream); \ + } else { \ + lite::cuda::math::elementwise_relu( \ + x_data, y_data, out_data, 1, pixel_num, 1, OP, stream); \ + } \ + } else { \ + if (is_broadcast(x->dims(), y->dims(), axis, &pre, &n, &post)) { \ + lite::cuda::math::elementwise( \ + x_data, y_data, out_data, pre, n, post, OP, stream); \ + } else { \ + lite::cuda::math::elementwise( \ + x_data, y_data, out_data, 1, pixel_num, 1, OP, stream); \ + } \ + } + +void ElementwiseAddCompute::Run() { + ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kADD, false) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseAddComputeNHWC::Run() { + ELEMENTWISE_COMPUTE_NHWC(lite::cuda::math::BinaryOperation::kADD, false) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseMulCompute::Run() { + ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kMUL, false) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseMulComputeNHWC::Run() { + ELEMENTWISE_COMPUTE_NHWC(lite::cuda::math::BinaryOperation::kMUL, false) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseAddReluCompute::Run() { + ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kADD, true) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseAddReluComputeNHWC::Run() { + ELEMENTWISE_COMPUTE_NHWC(lite::cuda::math::BinaryOperation::kADD, true) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseMulReluCompute::Run() { + ELEMENTWISE_COMPUTE(lite::cuda::math::BinaryOperation::kMUL, true) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +void ElementwiseMulReluComputeNHWC::Run() { + ELEMENTWISE_COMPUTE_NHWC(lite::cuda::math::BinaryOperation::kMUL, true) + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(elementwise_add, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::ElementwiseAddCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_add, + kCUDA, + kFloat, + kNHWC, + paddle::lite::kernels::cuda::ElementwiseAddComputeNHWC, + nhwc_format) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + 
+REGISTER_LITE_KERNEL(elementwise_mul, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::ElementwiseMulCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(elementwise_mul, + kCUDA, + kFloat, + kNHWC, + paddle::lite::kernels::cuda::ElementwiseMulComputeNHWC, + nhwc_format) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fusion_elementwise_add_activation, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::ElementwiseAddReluCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fusion_elementwise_add_activation, + kCUDA, + kFloat, + kNHWC, + paddle::lite::kernels::cuda::ElementwiseAddReluComputeNHWC, + nhwc_format) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::ElementwiseMulReluCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation, + kCUDA, + kFloat, + kNHWC, + paddle::lite::kernels::cuda::ElementwiseMulReluComputeNHWC, + nhwc_format) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/cuda/elementwise_compute.h b/lite/kernels/cuda/elementwise_compute.h new file mode 100644 index 0000000000..986a4db227 --- /dev/null +++ b/lite/kernels/cuda/elementwise_compute.h @@ -0,0 +1,98 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class ElementwiseAddCompute + : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + + void Run() override; + virtual ~ElementwiseAddCompute() = default; +}; + +class ElementwiseAddComputeNHWC + : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + + void Run() override; + virtual ~ElementwiseAddComputeNHWC() = default; +}; + +class ElementwiseMulCompute + : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + + void Run() override; + virtual ~ElementwiseMulCompute() = default; +}; + +class ElementwiseMulComputeNHWC + : public KernelLite { + public: + using param_t = operators::ElementwiseParam; + + void Run() override; + virtual ~ElementwiseMulComputeNHWC() = default; +}; + +class ElementwiseAddReluCompute + : public KernelLite { + public: + using param_t = operators::FusionElementwiseActivationParam; + + void Run() override; + virtual ~ElementwiseAddReluCompute() = default; +}; + +class ElementwiseAddReluComputeNHWC + : public KernelLite { + public: + using param_t = operators::FusionElementwiseActivationParam; + + void Run() override; + virtual ~ElementwiseAddReluComputeNHWC() = default; +}; + +class ElementwiseMulReluCompute + : public KernelLite { + public: + using param_t = operators::FusionElementwiseActivationParam; + + void Run() override; + virtual ~ElementwiseMulReluCompute() = default; +}; + +class ElementwiseMulReluComputeNHWC + : public KernelLite { + public: + using param_t = operators::FusionElementwiseActivationParam; + + void Run() override; + virtual ~ElementwiseMulReluComputeNHWC() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/elementwise_compute_test.cc b/lite/kernels/cuda/elementwise_compute_test.cc new file mode 100644 index 0000000000..9fd0b7754f --- /dev/null +++ b/lite/kernels/cuda/elementwise_compute_test.cc @@ -0,0 +1,252 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/cuda/elementwise_compute.h" +#include +#include +#include +#include "lite/api/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +using Tensor = lite::Tensor; + +static void ElementwiseAddRef(float* x, float* y, float* out, int num) { + for (int i = 0; i < num; ++i) { + out[i] = x[i] + y[i]; + } +} + +static void ElementwiseBroadcastRef( + float* x, float* y, float* out, int pre, int n, int post) { + for (int i = 0; i < pre * n * post; ++i) { + int idx = (i / post) % n; + out[i] = x[i] + y[idx]; + } +} + +TEST(elementwise_add, normal) { + ElementwiseAddCompute elementwise_add_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::ElementwiseParam param; + Tensor x, y, out; + Tensor x_cpu, y_cpu, out_cpu; + Tensor x_ref, y_ref, out_ref; + + const int n = 1; + const int c = 3; + const int h = 2000; + const int w = 2000; + + x.Resize({n, c, h, w}); + y.Resize({n, c, h, w}); + out.Resize({n, c, h, w}); + x_cpu.Resize({n, c, h, w}); + y_cpu.Resize({n, c, h, w}); + out_cpu.Resize({n, c, h, w}); + x_ref.Resize({n, c, h, w}); + y_ref.Resize({n, c, h, w}); + out_ref.Resize({n, c, h, w}); + + auto* out_data = out.mutable_data(TARGET(kCUDA)); + + auto* x_cpu_data = x_cpu.mutable_data(); + auto* y_cpu_data = y_cpu.mutable_data(); + auto* out_cpu_data = out_cpu.mutable_data(); + + auto* x_ref_data = x_ref.mutable_data(); + auto* y_ref_data = y_ref.mutable_data(); + auto* out_ref_data = out_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = i + 5.0; + x_ref_data[i] = i + 5.0; + } + for (int i = 0; i < y_cpu.numel(); ++i) { + y_cpu_data[i] = i - 5.0; + y_ref_data[i] = i - 5.0; + } + + x.Assign(x_cpu_data, x_cpu.dims()); + y.Assign(y_cpu_data, y_cpu.dims()); + + param.X = &x; + param.Y = &y; + param.Out = &out; + elementwise_add_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + elementwise_add_kernel.SetContext(std::move(ctx)); + elementwise_add_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + ElementwiseAddRef(x_ref_data, y_ref_data, out_ref_data, out.numel()); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(elementwise_add, bias) { + ElementwiseAddCompute elementwise_add_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::ElementwiseParam param; + Tensor x, y, out; + Tensor x_cpu, y_cpu, out_cpu; + Tensor x_ref, y_ref, out_ref; + + const int n = 1; + const int c = 3; + const int h = 2000; + const int w = 2000; + + x.Resize({n, c, h, w}); + y.Resize({c, 1, 1}); + out.Resize({n, c, h, w}); + x_cpu.Resize({n, c, h, w}); + y_cpu.Resize({c, 1, 1}); + out_cpu.Resize({n, c, h, w}); + x_ref.Resize({n, c, h, w}); + y_ref.Resize({c, 1, 1}); + out_ref.Resize({n, c, h, w}); + + auto* out_data = out.mutable_data(TARGET(kCUDA)); + + auto* x_cpu_data = x_cpu.mutable_data(); + auto* y_cpu_data = y_cpu.mutable_data(); + auto* out_cpu_data = out_cpu.mutable_data(); + + auto* x_ref_data = x_ref.mutable_data(); + auto* y_ref_data = y_ref.mutable_data(); + auto* out_ref_data = out_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = i + 5.0; + x_ref_data[i] = i + 5.0; + } + for (int i = 0; i < y_cpu.numel(); ++i) { + y_cpu_data[i] = i - 5.0; + y_ref_data[i] = i - 5.0; + } + + x.Assign(x_cpu_data, 
x_cpu.dims()); + y.Assign(y_cpu_data, y_cpu.dims()); + + param.X = &x; + param.Y = &y; + param.Out = &out; + param.axis = -1; + elementwise_add_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + elementwise_add_kernel.SetContext(std::move(ctx)); + elementwise_add_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + ElementwiseBroadcastRef(x_ref_data, y_ref_data, out_ref_data, n, c, h * w); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + } +} + +TEST(elementwise_add_nhwc, bias) { + ElementwiseAddComputeNHWC elementwise_add_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::ElementwiseParam param; + Tensor x, y, out; + Tensor x_cpu, y_cpu, out_cpu; + Tensor x_ref, y_ref, out_ref; + + const int n = 1; + const int c = 3; + const int h = 2000; + const int w = 2000; + + x.Resize({n, h, w, c}); + y.Resize({c, 1, 1}); + out.Resize({n, h, w, c}); + x_cpu.Resize({n, h, w, c}); + y_cpu.Resize({c, 1, 1}); + out_cpu.Resize({n, h, w, c}); + x_ref.Resize({n, h, w, c}); + y_ref.Resize({c, 1, 1}); + out_ref.Resize({n, h, w, c}); + + auto* out_data = out.mutable_data(TARGET(kCUDA)); + + auto* x_cpu_data = x_cpu.mutable_data(); + auto* y_cpu_data = y_cpu.mutable_data(); + auto* out_cpu_data = out_cpu.mutable_data(); + + auto* x_ref_data = x_ref.mutable_data(); + auto* y_ref_data = y_ref.mutable_data(); + auto* out_ref_data = out_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = i + 5.0; + x_ref_data[i] = i + 5.0; + } + for (int i = 0; i < y_cpu.numel(); ++i) { + y_cpu_data[i] = i - 5.0; + y_ref_data[i] = i - 5.0; + } + + x.Assign(x_cpu_data, x_cpu.dims()); + y.Assign(y_cpu_data, y_cpu.dims()); + + param.X = &x; + param.Y = &y; + param.Out = &out; + param.axis = -1; + elementwise_add_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + elementwise_add_kernel.SetContext(std::move(ctx)); + elementwise_add_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + ElementwiseBroadcastRef( + x_ref_data, y_ref_data, out_ref_data, n * h * w, c, 1); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/feed_compute.cc b/lite/kernels/cuda/feed_compute.cc index cffa8a573d..e54c5b9b03 100644 --- a/lite/kernels/cuda/feed_compute.cc +++ b/lite/kernels/cuda/feed_compute.cc @@ -20,21 +20,22 @@ namespace lite { namespace kernels { namespace cuda { -void FeedCompute::Run() { - auto& param = this->Param(); +template +void FeedCompute::Run() { + auto& param = this->template Param(); auto& ctx = this->ctx_->template As(); auto stream = ctx.exec_stream(); VLOG(4) << "feed_list.size: " << param.feed_list->size(); const lite::Tensor& feed_item = (*param.feed_list)[param.col]; int num = static_cast(feed_item.numel()); - auto input = feed_item.data(); + auto input = feed_item.data(); param.out->Resize(feed_item.dims()); - auto output = param.out->mutable_data(TARGET(kCUDA)); + auto output = param.out->template mutable_data(TARGET(kCUDA)); VLOG(4) << "col: " << param.col << " num:" << num; TargetW::MemcpyAsync( - output, input, num * sizeof(float), 
IoDirection::HtoD, stream); + output, input, num * sizeof(T), IoDirection::HtoD, stream); } } // namespace cuda @@ -42,8 +43,13 @@ void FeedCompute::Run() { } // namespace lite } // namespace paddle -REGISTER_LITE_KERNEL( - feed, kCUDA, kFloat, kNCHW, paddle::lite::kernels::cuda::FeedCompute, nchw) +typedef paddle::lite::kernels::cuda::FeedCompute + FeedFp32; + +typedef paddle::lite::kernels::cuda::FeedCompute + FeedInt64; + +REGISTER_LITE_KERNEL(feed, kCUDA, kFloat, kNCHW, FeedFp32, nchw) .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kFloat), @@ -54,8 +60,7 @@ REGISTER_LITE_KERNEL( DATALAYOUT(kNCHW))}) .Finalize(); -REGISTER_LITE_KERNEL( - feed, kCUDA, kFloat, kNHWC, paddle::lite::kernels::cuda::FeedCompute, nhwc) +REGISTER_LITE_KERNEL(feed, kCUDA, kFloat, kNHWC, FeedFp32, nhwc) .BindInput("X", {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kFloat), @@ -65,3 +70,25 @@ REGISTER_LITE_KERNEL( PRECISION(kFloat), DATALAYOUT(kNHWC))}) .Finalize(); + +REGISTER_LITE_KERNEL(feed, kCUDA, kInt64, kNCHW, FeedInt64, nchw) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt64), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kInt64), + DATALAYOUT(kNCHW))}) + .Finalize(); + +REGISTER_LITE_KERNEL(feed, kCUDA, kInt64, kNHWC, FeedInt64, nhwc) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt64), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kInt64), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/cuda/feed_compute.h b/lite/kernels/cuda/feed_compute.h index 0510404b2b..9c42dcc1ca 100644 --- a/lite/kernels/cuda/feed_compute.h +++ b/lite/kernels/cuda/feed_compute.h @@ -20,7 +20,8 @@ namespace lite { namespace kernels { namespace cuda { -class FeedCompute : public KernelLite { +template +class FeedCompute : public KernelLite { public: using param_t = operators::FeedParam; using TargetW = TargetWrapper; diff --git a/lite/kernels/cuda/layout_compute.cc b/lite/kernels/cuda/layout_compute.cc index e2d0ae4f2e..6b56d9e1de 100644 --- a/lite/kernels/cuda/layout_compute.cc +++ b/lite/kernels/cuda/layout_compute.cc @@ -13,6 +13,7 @@ // limitations under the License. 
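// Editor's note (not part of this patch): FeedCompute is now templated on the
// element type and registered above for both kFloat and kInt64 precisions;
// the kInt64 variant lets integer id tensors (e.g. the Ids consumed by the
// lookup_table / lookup_table_v2 kernels registered later in this patch with
// PRECISION(kInt64)) be fed to the device without an extra cast.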
#include "lite/kernels/cuda/layout_compute.h" +#include #include "lite/backends/cuda/math/transpose.h" #include "lite/core/op_registry.h" @@ -21,11 +22,32 @@ namespace lite { namespace kernels { namespace cuda { +inline DDim trim_singular_dims(const DDim& dims) { + auto actual_dims_size = dims.size(); + for (; actual_dims_size != 0; --actual_dims_size) { + if (dims[actual_dims_size - 1] != 1) break; + } + std::vector trim_dims; + trim_dims.resize(actual_dims_size); + for (int i = 0; i < actual_dims_size; ++i) { + trim_dims[i] = dims[i]; + } + if (trim_dims.size() == 0) { + return DDim(); + } + return DDim(trim_dims); +} + #define NCHWTONHWC(type) \ auto& param = this->template Param(); \ auto& ctx = this->ctx_->template As(); \ auto input = param.x->template data(); \ auto input_dim = param.x->dims(); \ + DDim input_trim_dim = trim_singular_dims(input_dim); \ + if (input_trim_dim.size() == 1) { \ + param.y->CopyDataFrom(*param.x); \ + return; \ + } \ CHECK(input_dim.size() == 4) \ << "NCHW to NHWC should guarantee that the input dims should be 4"; \ int n = input_dim[0]; \ @@ -41,6 +63,11 @@ namespace cuda { auto& ctx = this->ctx_->template As(); \ auto input = param.x->template data(); \ auto input_dim = param.x->dims(); \ + DDim input_trim_dim = trim_singular_dims(input_dim); \ + if (input_trim_dim.size() == 1) { \ + param.y->CopyDataFrom(*param.x); \ + return; \ + } \ CHECK(input_dim.size() == 4) \ << "NHWC to NCHW should guarantee that the input dims should be 4"; \ int n = input_dim[0]; \ diff --git a/lite/kernels/cuda/lookup_table_compute.cu b/lite/kernels/cuda/lookup_table_compute.cu index 34b6de0e10..3c3bb952ca 100644 --- a/lite/kernels/cuda/lookup_table_compute.cu +++ b/lite/kernels/cuda/lookup_table_compute.cu @@ -98,3 +98,14 @@ REGISTER_LITE_KERNEL(lookup_table, .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))}) .Finalize(); +REGISTER_LITE_KERNEL(lookup_table_v2, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::LookupTableCompute, + def) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))}) + .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))}) + .Finalize(); diff --git a/lite/kernels/cuda/match_matrix_tensor_compute.cu b/lite/kernels/cuda/match_matrix_tensor_compute.cu new file mode 100644 index 0000000000..f89b9c9578 --- /dev/null +++ b/lite/kernels/cuda/match_matrix_tensor_compute.cu @@ -0,0 +1,145 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/match_matrix_tensor_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { +using Tensor = lite::Tensor; + +void MatchMatrixTensorCompute::PrepareForRun() { + gemm_impl_.reset(new lite::cuda::math::Gemm); +} + +void MatchMatrixTensorCompute::Run() { + CHECK(ctx_) << "running context should be set first"; + auto& param = this->Param(); + auto& context = this->ctx_->template As(); + + auto* x = param.x; + auto* w = param.w; + auto* y = param.y; + auto* out = param.out; + auto* tmp = param.tmp; + int dim_t = param.dim_t; + int dim_in = x->dims()[1]; + + const auto& offset_l = x->lod()[0]; + const auto& offset_r = y->lod()[0]; + + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + top_size += dim_t * len_l * len_r; + top_offset.push_back(top_size); + } + + auto* bottom_l_data = x->data(); + auto* bottom_r_data = y->data(); + auto* t_data = w->data(); + auto* out_data = out->mutable_data(TARGET(kCUDA)); + auto* bottom_l_trans_data = tmp->mutable_data(TARGET(kCUDA)); + + gemm_impl_->init( + false, false, x->dims()[0], dim_t * dim_in, dim_in, &context); + gemm_impl_->run( + 1.0f, 0.0f, bottom_l_data, t_data, bottom_l_trans_data, &context); + + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + for (int t = 0; t < dim_t; t++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + auto* top_data = out_data + top_offset[b] + t * len_l * len_r; + const auto* l_t_data = + bottom_l_trans_data + offset_l[b] * dim_t * dim_in + t * dim_in; + const auto* r_data = bottom_r_data + offset_r[b] * dim_in; + + gemm_impl_->init(false, + true, + len_l, + len_r, + dim_in, + dim_t * dim_in, + dim_in, + len_r, + &context); + gemm_impl_->run(1.0f, 0.0f, l_t_data, r_data, top_data, &context); + } + } + + int batch_size = x->lod()[0].size() - 1; + int lod_lv1_size = batch_size * dim_t; + int lod_lv2_size = x->lod()[0].back() * dim_t; + std::vector out_lod0(batch_size + 1, 0); + std::vector out_lod1(lod_lv1_size + 1, 0); + std::vector out_lod2(lod_lv2_size + 1, 0); + for (int i = 0; i < batch_size; i++) { + out_lod0[i + 1] = out_lod0[i] + dim_t; + int len_l = offset_l[i + 1] - offset_l[i]; + + for (int j = 0; j < dim_t; j++) { + out_lod1[i * dim_t + j + 1] = out_lod1[i * dim_t + j] + len_l; + int len_r = offset_r[i + 1] - offset_r[i]; + + for (int k = 0; k < len_l; k++) { + out_lod2[offset_l[i] * dim_t + j * len_l + k + 1] = + out_lod2[offset_l[i] * dim_t + j * len_l + k] + len_r; + } + } + } + + LoD out_lod; + out_lod.push_back(top_offset); + out_lod.push_back(offset_l); + out_lod.push_back(offset_r); + out->set_lod(out_lod); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(match_matrix_tensor, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::MatchMatrixTensorCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("W", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Y", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Tmp", + 
{LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/match_matrix_tensor_compute.h b/lite/kernels/cuda/match_matrix_tensor_compute.h new file mode 100644 index 0000000000..09db326ff3 --- /dev/null +++ b/lite/kernels/cuda/match_matrix_tensor_compute.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/backends/cuda/blas.h" +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class MatchMatrixTensorCompute + : public KernelLite { + public: + using param_t = operators::MatchMatrixTensorParam; + + void PrepareForRun() override; + void Run() override; + virtual ~MatchMatrixTensorCompute() = default; + + private: + std::unique_ptr> gemm_impl_; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/match_matrix_tensor_compute_test.cc b/lite/kernels/cuda/match_matrix_tensor_compute_test.cc new file mode 100644 index 0000000000..ce0ae2a7a8 --- /dev/null +++ b/lite/kernels/cuda/match_matrix_tensor_compute_test.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
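// Editor's note (worked example, not part of this patch): the sizes hard-coded
// in the test below follow from the top_offset computation in
// MatchMatrixTensorCompute::Run -- each batch b contributes
// dim_t * len_l * len_r scores. With x lod {0, 2, 5}, y lod {0, 3, 4} and
// dim_t = 2 that is 2*2*3 + 2*3*1 = 12 + 6 = 18, hence out.Resize({18, 1});
// tmp holds the x * w projection of ix * dim_t * h = 5 * 2 * 2 = 20 floats,
// hence tmp.Resize({20, 1}).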
+ +#include "lite/kernels/cuda/match_matrix_tensor_compute.h" +#include +#include +#include +#include +#include "lite/api/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +using Tensor = lite::Tensor; + +TEST(match_matrix_tensor, normal) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + MatchMatrixTensorCompute kernel; + operators::MatchMatrixTensorParam param; + + // prepare ins and outs tensor in gpu, including size and lod + int ix = 5, iy = 4, h = 2, dim_t = 2; + Tensor x, w, y, out, tmp; + x.Resize({ix, h}); + w.Resize({h, dim_t, h}); + y.Resize({iy, h}); + out.Resize({18, 1}); + tmp.Resize({20, 1}); + LoD x_lod{}; + x_lod.push_back({0, 2, 5}); + x.set_lod(x_lod); + LoD y_lod{}; + y_lod.push_back({0, 3, 4}); + y.set_lod(y_lod); + + // init ins tensor in cpu + Tensor x_cpu, w_cpu, y_cpu, out_cpu, tmp_cpu; + x_cpu.Resize({ix, h}); + w_cpu.Resize({h, dim_t, h}); + y_cpu.Resize({iy, h}); + out_cpu.Resize({18, 1}); + tmp_cpu.Resize({20, 1}); + + auto* x_cpu_data = x_cpu.mutable_data(); + auto* w_cpu_data = w_cpu.mutable_data(); + auto* y_cpu_data = y_cpu.mutable_data(); + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = static_cast(i); + } + for (int i = 0; i < w_cpu.numel(); ++i) { + w_cpu_data[i] = static_cast(i); + } + for (int i = 0; i < y_cpu.numel(); ++i) { + y_cpu_data[i] = static_cast(i); + } + + // cpu tensor data assigin to gpu tensor + x.Assign(x_cpu_data, x_cpu.dims()); + w.Assign(w_cpu_data, w_cpu.dims()); + y.Assign(y_cpu_data, y_cpu.dims()); + + param.x = &x; + param.w = &w; + param.y = &y; + param.dim_t = dim_t; + param.out = &out; + param.tmp = &tmp; + kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + kernel.SetContext(std::move(ctx)); + kernel.Launch(); + cudaDeviceSynchronize(); + + auto* out_cpu_data = out_cpu.mutable_data(); + auto* out_data = out.mutable_data(TARGET(kCUDA)); + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + std::vector ref_results = {5, + 23, + 41, + 17, + 75, + 133, + 7, + 33, + 59, + 27, + 125, + 223, + 323, + 455, + 587, + 557, + 793, + 1029}; + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], ref_results[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/mul_compute_test.cc b/lite/kernels/cuda/mul_compute_test.cc index d1c1d63e7d..f521a12e2d 100644 --- a/lite/kernels/cuda/mul_compute_test.cc +++ b/lite/kernels/cuda/mul_compute_test.cc @@ -16,6 +16,7 @@ #include #include #include +#include "lite/backends/cuda/blas.h" namespace paddle { namespace lite { @@ -26,6 +27,7 @@ TEST(mul_compute, normal) { MulCompute mul_kernel; std::unique_ptr ctx(new KernelContext); auto& context = ctx->As(); + context.InitOnce(); Tensor x, y, out, x_cpu, y_cpu, out_cpu; int x_h = 2, x_w_y_h = 3, y_w = 4; diff --git a/lite/kernels/cuda/nearest_interp_compute.cu b/lite/kernels/cuda/nearest_interp_compute.cu index 1a614e0656..adae034a1d 100644 --- a/lite/kernels/cuda/nearest_interp_compute.cu +++ b/lite/kernels/cuda/nearest_interp_compute.cu @@ -11,6 +11,7 @@ limitations under the License. 
*/ #pragma once #include +#include "lite/backends/cuda/target_wrapper.h" #include "lite/core/op_registry.h" #include "lite/kernels/cuda/nearest_interp_compute.h" @@ -20,6 +21,43 @@ namespace kernels { namespace cuda { using Tensor = lite::Tensor; +inline std::vector get_new_shape( + std::vector list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + lite::Tensor temp; + auto temp_data = temp.mutable_data(); + auto tensor_data = tensor->data(); + cudaMemcpy(temp_data, + tensor_data, + tensor->dims().production() * sizeof(float), + cudaMemcpyDeviceToHost); + + vec_new_shape.push_back(static_cast(*temp_data)); + } + + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + lite::Tensor cpu_starts_tensor; + auto cpu_starts_tensor_data = cpu_starts_tensor.mutable_data(); + cudaMemcpy(cpu_starts_tensor_data, + new_data, + new_data_tensor->dims().production() * sizeof(T), + cudaMemcpyDeviceToHost); + + auto new_data_ = cpu_starts_tensor.data(); + vec_new_data = std::vector( + new_data_, new_data_ + new_data_tensor->dims().production()); + return vec_new_data; +} + __global__ void KeNearestNeighborInterp(const float* in, const size_t in_img_h, const size_t in_img_w, @@ -79,19 +117,34 @@ void NearestInterpCompute::Run() { int out_w = param.out_w; float scale = param.scale; bool align_corners = param.align_corners; - if (scale > 0) { - out_h = static_cast(in_h * scale); - out_w = static_cast(in_w * scale); - } - - if (out_size != nullptr) { - Tensor sizes; - float* size_data = sizes.mutable_data(); - float* outsize_data = out_size->mutable_data(TARGET(kCUDA)); - cudaMemcpy( - size_data, outsize_data, sizeof(float) * 2, cudaMemcpyDeviceToHost); - out_h = static_cast(size_data[0]); - out_w = static_cast(size_data[1]); + auto align_mode = param.align_mode; + + auto list_new_shape_tensor = param.SizeTensor; + if (list_new_shape_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_shape_tensor); + out_h = new_size[0]; + out_w = new_size[1]; + } else { + auto scale_tensor = param.Scale; + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } + if (scale > 0) { + out_h = static_cast(in_h * scale); + out_w = static_cast(in_w * scale); + } + + if (out_size != nullptr) { + lite::Tensor sizes; + float* size_data = sizes.mutable_data(); + float* outsize_data = out_size->mutable_data(TARGET(kCUDA)); + cudaMemcpy( + size_data, outsize_data, sizeof(float) * 2, cudaMemcpyDeviceToHost); + out_h = static_cast(size_data[0]); + out_w = static_cast(size_data[1]); + } } auto output_data = output->mutable_data(TARGET(kCUDA)); @@ -162,6 +215,14 @@ REGISTER_LITE_KERNEL(nearest_interp, {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW))}) + .BindInput("SizeTensor", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Scale", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat), diff --git a/lite/kernels/cuda/nearest_interp_compute_test.cc b/lite/kernels/cuda/nearest_interp_compute_test.cc index 85032016d6..ad2ef9294e 100644 --- a/lite/kernels/cuda/nearest_interp_compute_test.cc +++ 
b/lite/kernels/cuda/nearest_interp_compute_test.cc @@ -16,6 +16,7 @@ #include #include #include +#include namespace paddle { namespace lite { @@ -143,6 +144,116 @@ TEST(nearest_interp, normal) { } } +TEST(nearest_interp, update) { + NearestInterpCompute nearest_interp_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::InterpolateParam param; + + std::vector size_tensor(2); + std::vector size_tensor_cpu(2), size_tensor_ref(2); + Tensor x, input_scale, osz, out; + Tensor x_cpu, input_scale_cpu, osz_cpu, out_cpu; + Tensor x_ref, input_scale_ref, osz_ref, out_ref; + + int n = 1, c = 3, in_h = 40, in_w = 40; + int out_h = 80, out_w = 80; + float scale = 2.0; + + param.out_h = out_h; + param.out_w = out_w; + param.scale = scale; + param.align_corners = false; + param.align_mode = 0; + + x.Resize({n, c, in_h, in_w}); + size_tensor[0].Resize({1}); + size_tensor[1].Resize({1}); + input_scale.Resize({1}); + osz.Resize({2}); + out.Resize({n, c, out_h, out_w}); + + x_cpu.Resize({n, c, in_h, in_w}); + size_tensor_cpu[0].Resize({1}); + size_tensor_cpu[1].Resize({1}); + input_scale_cpu.Resize({1}); + osz_cpu.Resize({2}); + out_cpu.Resize({n, c, out_h, out_w}); + + x_ref.Resize({n, c, in_h, in_w}); + size_tensor_ref[0].Resize({1}); + size_tensor_ref[1].Resize({1}); + input_scale_ref.Resize({1}); + osz_ref.Resize({2}); + out_ref.Resize({n, c, out_h, out_w}); + + auto* out_data = out.mutable_data(TARGET(kCUDA)); + + float* x_cpu_data = x_cpu.mutable_data(); + float* size_tensor0_cpu_data = size_tensor_cpu[0].mutable_data(); + float* size_tensor1_cpu_data = size_tensor_cpu[1].mutable_data(); + float* input_scale_cpu_data = input_scale_cpu.mutable_data(); + float* osz_cpu_data = osz_cpu.mutable_data(); + float* out_cpu_data = out_cpu.mutable_data(); + + float* x_ref_data = x_ref.mutable_data(); + float* size_tensor0_ref_data = size_tensor_ref[0].mutable_data(); + float* size_tensor1_ref_data = size_tensor_ref[1].mutable_data(); + float* input_scale_ref_data = input_scale_ref.mutable_data(); + float* osz_ref_data = osz_ref.mutable_data(); + float* out_ref_data = out_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = i + 5.0; + x_ref_data[i] = i + 5.0; + } + osz_cpu_data[0] = out_h; + osz_cpu_data[1] = out_w; + size_tensor0_cpu_data[0] = out_h; + size_tensor1_cpu_data[0] = out_w; + input_scale_cpu_data[0] = scale; + osz_ref_data[0] = out_h; + osz_ref_data[1] = out_w; + size_tensor0_ref_data[0] = out_h; + size_tensor1_ref_data[0] = out_w; + input_scale_ref_data[0] = scale; + + x.Assign(x_cpu_data, x_cpu.dims()); + size_tensor[0].Assign( + size_tensor0_cpu_data, size_tensor[0].dims()); + size_tensor[1].Assign( + size_tensor1_cpu_data, size_tensor[1].dims()); + input_scale.Assign(input_scale_cpu_data, + input_scale.dims()); + osz.Assign(osz_cpu_data, osz_cpu.dims()); + + param.X = &x; + param.SizeTensor.emplace_back( + reinterpret_cast(&size_tensor[0])); + param.SizeTensor.emplace_back( + reinterpret_cast(&size_tensor[1])); + param.Scale = &input_scale; + param.OutSize = &osz; + param.Out = &out; + nearest_interp_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + nearest_interp_kernel.SetContext(std::move(ctx)); + nearest_interp_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + NearestInterpRef(&x_ref, &out_ref, false); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], 
out_ref_data[i], 1e-5); + } +} + } // namespace cuda } // namespace kernels } // namespace lite diff --git a/lite/kernels/cuda/pool_compute.cu b/lite/kernels/cuda/pool_compute.cu index a2483a2c75..d7e3739ddb 100644 --- a/lite/kernels/cuda/pool_compute.cu +++ b/lite/kernels/cuda/pool_compute.cu @@ -256,6 +256,7 @@ void PoolCompute::Run() { bool adaptive = param.adaptive; auto x_dims = param.x->dims(); auto out_dims = param.output->dims(); + auto paddings = *param.paddings; const int in_h = x_dims[2]; const int in_w = x_dims[3]; const int out_h = out_dims[2]; @@ -266,8 +267,8 @@ void PoolCompute::Run() { const int win_w = param.ksize[1]; const int stride_h = param.strides[0]; const int stride_w = param.strides[1]; - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; + const int pad_h = paddings[0]; + const int pad_w = paddings[2]; const int total_threads = out_dims.production(); const int threads = 512; const int blocks = (total_threads + threads - 1) / threads; @@ -357,6 +358,61 @@ void PoolCompute::Run() { if (error != cudaSuccess) LOG(FATAL) << cudaGetErrorString(error); } +inline int PoolOutputSize( + int input_size, int filter_size, int padding, int stride, bool ceil_mode) { + int output_size; + if (!ceil_mode) { + output_size = (input_size - filter_size + 2 * padding) / stride + 1; + } else { + output_size = + (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; + } + return output_size; +} + +void PoolComputeNHWC::PrepareForRun() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + pool_impl_.reset(new lite::cuda::math::CudnnPool2DNHWC); + pool_impl_->init(param, &ctx); +} + +void PoolComputeNHWC::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + const auto x_dims = param.x->dims(); + std::vector& ksize = param.ksize; + if (param.global_pooling) { + ksize.resize(static_cast(x_dims.size()) - 2); + for (size_t i = 0; i < ksize.size(); ++i) { + (*param.paddings)[i] = 0; + ksize[i] = static_cast(x_dims[i + 1]); + } + } + + std::vector output_shape({x_dims[0]}); + if (param.adaptive) { + output_shape.insert( + output_shape.end(), param.ksize.begin(), param.ksize.end()); + } else { + for (size_t i = 0; i < param.ksize.size(); ++i) { + output_shape.push_back(PoolOutputSize(x_dims[i + 1], + param.ksize[i], + (*param.paddings)[i], + param.strides[i], + param.ceil_mode)); + } + } + output_shape.push_back(x_dims[3]); + param.output->Resize(lite::DDim(output_shape)); + + pool_impl_->run(param); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(FATAL) << cudaGetErrorString(error); +} + } // namespace cuda } // namespace kernels } // namespace lite @@ -373,3 +429,19 @@ REGISTER_LITE_KERNEL( PRECISION(kFloat), DATALAYOUT(kNCHW))}) .Finalize(); + +REGISTER_LITE_KERNEL(pool2d, + kCUDA, + kFloat, + kNHWC, + paddle::lite::kernels::cuda::PoolComputeNHWC, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNHWC))}) + .Finalize(); diff --git a/lite/kernels/cuda/pool_compute.h b/lite/kernels/cuda/pool_compute.h index 55b346bfaf..5c3a1bc2b9 100644 --- a/lite/kernels/cuda/pool_compute.h +++ b/lite/kernels/cuda/pool_compute.h @@ -13,6 +13,9 @@ // limitations under the License. 
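// Editor's note (worked example, not part of this patch): PoolOutputSize in
// pool_compute.cu computes, without ceil_mode,
//   out = (in - k + 2 * pad) / stride + 1,
// and with ceil_mode it adds (stride - 1) to the numerator before dividing.
// For the NHWC test configuration added later in this patch (in = 8, k = 3,
// pad = 1, stride = 3, ceil_mode = false) that is (8 - 3 + 2) / 3 + 1 = 3, so
// an 8x8 feature map pools down to 3x3; PoolComputeNHWC::Run then appends the
// channel count last, producing an {n, 3, 3, c} output shape.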
#pragma once +#include +#include +#include "lite/backends/cuda/math/cudnn_pool.h" #include "lite/core/kernel.h" namespace paddle { @@ -29,6 +32,20 @@ class PoolCompute virtual ~PoolCompute() = default; }; +class PoolComputeNHWC + : public KernelLite { + public: + using param_t = operators::PoolParam; + + void PrepareForRun() override; + void Run() override; + virtual ~PoolComputeNHWC() = default; + + private: + std::unique_ptr> + pool_impl_; +}; + } // namespace cuda } // namespace kernels } // namespace lite diff --git a/lite/kernels/cuda/pool_compute_test.cc b/lite/kernels/cuda/pool_compute_test.cc index fe6ff92c0c..0e5aeec8c0 100644 --- a/lite/kernels/cuda/pool_compute_test.cc +++ b/lite/kernels/cuda/pool_compute_test.cc @@ -27,42 +27,123 @@ namespace cuda { using Tensor = lite::Tensor; using DDim = lite::DDim; -static int PoolOutputSize( - int input_size, int filter_size, int padding, int stride, bool ceil_mode) { +#define IN(n, c, h, w) \ + input_data[w + h * input_w + c * input_h * input_w + \ + n * input_c * input_h * input_w] +#define OUT(n, c, h, w) \ + output_data[w + h * output_w + c * output_h * output_w + \ + n * output_c * output_h * output_w] + +template +void nchw2nhwc_ref(lite::Tensor* input, lite::Tensor* output) { + auto* input_data = input->data(); + auto* output_data = output->mutable_data(); + + int input_n = input->dims()[0]; + int input_c = input->dims()[1]; + int input_h = input->dims()[2]; + int input_w = input->dims()[3]; + int output_c = output->dims()[1]; + int output_h = output->dims()[2]; + int output_w = output->dims()[3]; + + for (int n = 0; n < input_n; ++n) { + for (int c = 0; c < input_c; ++c) { + for (int h = 0; h < input_h; ++h) { + for (int w = 0; w < input_w; ++w) { + OUT(n, h, w, c) = IN(n, c, h, w); + } + } + } + } +} + +#undef IN +#undef OUT + +#define IN(n, h, w, c) \ + input_data[c + w * input_c + h * input_w * input_c + \ + n * input_h * input_w * input_c] +#define OUT(n, h, w, c) \ + output_data[c + w * output_c + h * output_w * output_c + \ + n * output_h * output_w * output_c] + +template +void nhwc2nchw_ref(lite::Tensor* input, lite::Tensor* output) { + auto* input_data = input->data(); + auto* output_data = output->mutable_data(); + + int input_n = input->dims()[0]; + int input_h = input->dims()[1]; + int input_w = input->dims()[2]; + int input_c = input->dims()[3]; + int output_h = output->dims()[1]; + int output_w = output->dims()[2]; + int output_c = output->dims()[3]; + + for (int n = 0; n < input_n; ++n) { + for (int c = 0; c < input_c; ++c) { + for (int h = 0; h < input_h; ++h) { + for (int w = 0; w < input_w; ++w) { + OUT(n, c, h, w) = IN(n, h, w, c); + } + } + } + } +} + +static int PoolOutputSize(int input_size, + int filter_size, + int pad_left, + int pad_right, + int stride, + bool ceil_mode) { int output_size; if (!ceil_mode) { - output_size = (input_size - filter_size + 2 * padding) / stride + 1; + output_size = + (input_size - filter_size + pad_left + pad_right) / stride + 1; } else { output_size = - (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; + (input_size - filter_size + pad_left + pad_right + stride - 1) / + stride + + 1; } return output_size; } -static std::vector compute_output_shape(operators::PoolParam* param_) { +static std::vector compute_output_shape(operators::PoolParam* param_, + bool is_nchw) { + int axis = 2; + if (!is_nchw) axis = 1; const auto x_dims = param_->x->dims(); std::vector& ksize = param_->ksize; if (param_->global_pooling) { ksize.resize(static_cast(x_dims.size()) - 2); + 
auto paddings = *param_->paddings; for (size_t i = 0; i < ksize.size(); ++i) { - param_->paddings[i] = 0; + paddings[2 * i] = 0; + paddings[2 * i + 1] = 0; ksize[i] = static_cast(x_dims[i + 2]); } } - std::vector output_shape({x_dims[0], x_dims[1]}); + std::vector output_shape({x_dims[0]}); + if (is_nchw) output_shape.push_back(x_dims[1]); if (param_->adaptive) { output_shape.insert( output_shape.end(), param_->ksize.begin(), param_->ksize.end()); } else { + auto paddings = *param_->paddings; for (size_t i = 0; i < param_->ksize.size(); ++i) { - output_shape.push_back(PoolOutputSize(x_dims[i + 2], + output_shape.push_back(PoolOutputSize(x_dims[i + axis], param_->ksize[i], - param_->paddings[i], + paddings[2 * i], + paddings[2 * i + 1], param_->strides[i], param_->ceil_mode)); } } + if (!is_nchw) output_shape.push_back(x_dims[3]); return output_shape; } @@ -75,7 +156,7 @@ static void pool_compute_ref(const operators::PoolParam& param) { std::vector ksize = param.ksize; std::vector strides = param.strides; - std::vector paddings = param.paddings; + std::vector paddings = *param.paddings; std::string pooling_type = param.pooling_type; bool global_pooling = param.global_pooling; @@ -99,7 +180,7 @@ static void pool_compute_ref(const operators::PoolParam& param) { int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; if (global_pooling == true) { for (int n = 0; n < in_n; ++n) { @@ -195,15 +276,15 @@ TEST(pool_cuda, compute) { for (auto pad : {0, 1}) { for (auto n : {1, 2}) { for (auto c : {1, 3}) { - for (auto h : {2, 3, 4, 11}) { - for (auto w : {2, 3, 4, 11}) { - VLOG(3) << "n:" << n << " c:" << c << " h:" << h - << " w:" << w << " ksize:" << ksize - << " stride:" << stride << " pad:" << pad - << " exclusive:" << exclusive - << " global_pooling:" << global_pooling - << " ceil_mode: " << ceil_mode - << " pooling_type:" << pooling_type; + for (auto h : {3}) { + for (auto w : {3}) { + LOG(INFO) << "n:" << n << " c:" << c << " h:" << h + << " w:" << w << " ksize:" << ksize + << " stride:" << stride << " pad:" << pad + << " exclusive:" << exclusive + << " global_pooling:" << global_pooling + << " ceil_mode: " << ceil_mode + << " pooling_type:" << pooling_type; // init x, output x.Resize(DDim(std::vector({n, c, h, w}))); @@ -226,14 +307,16 @@ TEST(pool_cuda, compute) { } param.global_pooling = global_pooling; param.strides = {stride, stride}; - param.paddings = {pad, pad}; + std::vector paddings = {pad, pad, pad, pad}; + param.paddings = + std::make_shared>(paddings); param.exclusive = exclusive; param.ceil_mode = ceil_mode; param.adaptive = false; param.use_quantizer = false; const std::vector& output_shape = - compute_output_shape(¶m); + compute_output_shape(¶m, true); if (output_shape[2] * output_shape[3] == 0) continue; output.Resize(DDim(output_shape)); output_ref.Resize(DDim(output_shape)); @@ -277,6 +360,131 @@ TEST(pool_cuda, compute) { } } } + +TEST(pool_cuda, nhwc) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + PoolComputeNHWC pool; + operators::PoolParam param; + pool.SetContext(std::move(ctx)); + + lite::Tensor x, temp; + lite::Tensor x_cpu; + lite::Tensor output; + lite::Tensor output_cpu, output_temp; + lite::Tensor output_ref; + for (auto pooling_type : {"max", "avg"}) { + for (auto ceil_mode : {false}) { + for (auto global_pooling : {true, false}) { + for (auto exclusive : {false, true}) { + 
for (auto ksize : {3}) { + for (auto stride : {3}) { + for (auto pad : {1}) { + for (auto n : {1}) { + for (auto c : {3}) { + for (auto h : {8}) { + for (auto w : {8}) { + LOG(INFO) << "n:" << n << " c:" << c << " h:" << h + << " w:" << w << " ksize:" << ksize + << " stride:" << stride << " pad:" << pad + << " exclusive:" << exclusive + << " global_pooling:" << global_pooling + << " ceil_mode: " << ceil_mode + << " pooling_type:" << pooling_type; + + // init x, output + x.Resize(DDim(std::vector({n, h, w, c}))); + temp.Resize(DDim(std::vector({n, h, w, c}))); + x_cpu.Resize(DDim(std::vector({n, c, h, w}))); + + auto* x_cpu_data = x_cpu.mutable_data(); + for (int i = 0; i < x_cpu.dims().production(); ++i) { + float sign = i % 3 == 0 ? -0.03 : 0.05f; + x_cpu_data[i] = sign * (i % 128); + } + + nchw2nhwc_ref(&x_cpu, &temp); + auto* temp_cpu_data = temp.mutable_data(); + + x.Assign(temp_cpu_data, + temp.dims()); + // fill param + param.x = &x; + param.output = &output; + param.pooling_type = pooling_type; + if (global_pooling) { + param.ksize = {h, w}; + } else { + param.ksize = {ksize, ksize}; + } + param.global_pooling = global_pooling; + param.strides = {stride, stride}; + std::vector paddings = {pad, pad, pad, pad}; + param.paddings = + std::make_shared>(paddings); + param.exclusive = exclusive; + param.ceil_mode = ceil_mode; + param.adaptive = false; + param.use_quantizer = false; + + const std::vector& output_shape = + compute_output_shape(¶m, false); + if (output_shape[2] * output_shape[3] == 0) continue; + output.Resize(DDim(output_shape)); + output_temp.Resize(DDim(output_shape)); + output_cpu.Resize(DDim(output_shape)); + + auto* output_data = + output.mutable_data(TARGET(kCUDA)); + auto* output_cpu_data = + output_cpu.mutable_data(); + + // compute + pool.SetParam(param); + pool.Launch(); + + // compute ref + param.x = &x_cpu; + // nchw + const std::vector& output_shape_ref = + compute_output_shape(¶m, true); + + output_ref.Resize(DDim(output_shape_ref)); + // auto* output_ref_data = + // output_ref.mutable_data(); + param.output = &output_ref; + pool_compute_ref(param); + nchw2nhwc_ref(&output_ref, &output_temp); + auto* output_temp_data = + output_temp.mutable_data(); + + cudaDeviceSynchronize(); + CopySync(output_cpu_data, + output_data, + sizeof(float) * output.numel(), + IoDirection::DtoH); + // compare + for (int i = 0; i < output.dims().production(); i++) { + EXPECT_NEAR( + output_cpu_data[i], output_temp_data[i], 1e-4); + } + VLOG(3) << "compare pass"; + } + } + } + } + } + } + } + } + } + } + } +} } // namespace cuda } // namespace kernels } // namespace lite diff --git a/lite/kernels/cuda/search_aligned_mat_mul_compute.cc b/lite/kernels/cuda/search_aligned_mat_mul_compute.cc new file mode 100644 index 0000000000..ddefb608dd --- /dev/null +++ b/lite/kernels/cuda/search_aligned_mat_mul_compute.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
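The BatchedGemm wrapper used by this kernel (see the header below) takes a flat pointer table of A, B and C operands; it presumably maps onto a pointer-array batched GEMM such as cuBLAS's cublasSgemmBatched. A minimal, hypothetical sketch of that call for the row-major, non-transposed case (names here are illustrative only):

#include <cublas_v2.h>
// Computes C[i] = alpha * A[i] * B[i] for i in [0, batch); each C is row-major M x N.
inline cublasStatus_t batched_sgemm_rowmajor_sketch(
    cublasHandle_t handle, int M, int N, int K, float alpha,
    const float* const A[], const float* const B[], float* const C[],
    int batch) {
  const float beta = 0.f;
  // cuBLAS is column-major, so row-major C = A * B is issued as C^T = B^T * A^T:
  // the operand arrays are swapped and m/n are exchanged.
  return cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                            N, M, K, &alpha,
                            B, N, A, K, &beta, C, N, batch);
}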
+ +#include "lite/kernels/cuda/search_aligned_mat_mul_compute.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda {} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_aligned_mat_mul, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchAlignedMatMulCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("_a_addr", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("_b_addr", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("_c_addr", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_aligned_mat_mul_compute.h b/lite/kernels/cuda/search_aligned_mat_mul_compute.h new file mode 100644 index 0000000000..b1c4552d9c --- /dev/null +++ b/lite/kernels/cuda/search_aligned_mat_mul_compute.h @@ -0,0 +1,103 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/backends/cuda/math/batched_gemm.h" +#include "lite/core/context.h" +#include "lite/core/kernel.h" +#include "lite/core/types.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SearchAlignedMatMulCompute + : public KernelLite { + public: + using param_t = operators::MatMulParam; + + void PrepareForRun() override { + auto& param = this->Param(); + CHECK(ctx_) << "running context should be set first"; + auto& cuda_ctx = ctx_->template As(); + bool x_transpose = param.transpose_X; + bool y_transpose = param.transpose_Y; + int seq_num = param.X->lod()[0].size() - 1; + batched_gemm_impl_.reset(new lite::cuda::math::BatchedGemm); + CHECK( + batched_gemm_impl_->init(x_transpose, y_transpose, seq_num, &cuda_ctx)); + A_ = static_cast(malloc(3 * seq_num * sizeof(float*))); + CHECK(A_); + } + + void Run() override { + auto& param = this->Param(); + auto x = param.X; + auto y = param.Y; + auto out = param.Out; + bool x_transpose = param.transpose_X; + bool y_transpose = param.transpose_Y; + float alpha = param.alpha; + const auto& x_dims = x->dims(); + const auto& y_dims = y->dims(); + const auto& x_lod = x->lod(); + const auto& y_lod = y->lod(); + const auto& x_lod_0 = x_lod[0]; + const auto& y_lod_0 = y_lod[0]; + int seq_num = x_lod_0.size() - 1; + int x_inner_size = x_dims[1]; + int y_inner_size = y_dims[1]; + int x_batch_size = x_lod_0[1]; + int y_batch_size = y_lod_0[1]; + int M = x_transpose ? x_inner_size : x_batch_size; + int N = y_transpose ? y_batch_size : y_inner_size; + int X_K = x_transpose ? x_batch_size : x_inner_size; + int Y_K = y_transpose ? 
y_inner_size : y_batch_size; + CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal"; + int K = X_K; + + auto x_data = x->data(); + auto y_data = y->data(); + auto out_data = out->mutable_data(TARGET(kCUDA)); + auto x_stride = x_batch_size * x_inner_size; + auto y_stride = y_batch_size * y_inner_size; + auto out_stride = M * N; + for (int seq = 0; seq < seq_num; seq++) { + A_[seq] = const_cast(x_data) + seq * x_stride; + A_[seq + seq_num] = const_cast(y_data) + seq * y_stride; + A_[seq + seq_num * 2] = out_data + seq * out_stride; + } + batched_gemm_impl_->run( + alpha, 0.0f, const_cast(A_), M, N, K, seq_num); + } + + ~SearchAlignedMatMulCompute() { + if (A_ != nullptr) { + free(A_); + } + } + + private: + std::unique_ptr> + batched_gemm_impl_; + float** A_{nullptr}; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_aligned_mat_mul_compute_test.cc b/lite/kernels/cuda/search_aligned_mat_mul_compute_test.cc new file mode 100644 index 0000000000..f08333b310 --- /dev/null +++ b/lite/kernels/cuda/search_aligned_mat_mul_compute_test.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/search_aligned_mat_mul_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void search_aligned_mat_mul_compute_ref(const operators::MatMulParam& param) { + auto x = param.X; + auto y = param.Y; + auto out = param.Out; + bool x_transpose = param.transpose_X; + bool y_transpose = param.transpose_Y; + T alpha = static_cast(param.alpha); + const auto x_dims = x->dims(); + const auto y_dims = y->dims(); + const auto& x_lod = x->lod(); + const auto& y_lod = y->lod(); + const auto& x_lod_0 = x_lod[0]; + const auto& y_lod_0 = y_lod[0]; + int seq_num = x_lod_0.size() - 1; + int x_inner_size = x_dims[1]; + int y_inner_size = y_dims[1]; + int x_batch_size = x_lod_0[1]; + int y_batch_size = y_lod_0[1]; + int M = x_transpose ? x_inner_size : x_batch_size; + int N = y_transpose ? y_batch_size : y_inner_size; + int X_K = x_transpose ? x_batch_size : x_inner_size; + int Y_K = y_transpose ? y_inner_size : y_batch_size; + CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal"; + int K = X_K; + int lda = x_transpose ? M : K; + int ldb = y_transpose ? 
K : N; + int ldc = N; + int x_stride = x_batch_size * x_inner_size; + int y_stride = y_batch_size * y_inner_size; + int out_stride = M * N; + auto x_data = x->data(); + auto y_data = y->data(); + auto out_data = out->mutable_data(); + + for (int seq = 0; seq < seq_num; seq++) { + auto a = x_data + seq * x_stride; + auto b = y_data + seq * y_stride; + auto c = out_data + seq * out_stride; + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + auto sum = static_cast(0); + for (int l = 0; l < K; l++) { + T av; + T bv; + if (x_transpose) { + av = a[l * lda + i]; + } else { + av = a[i * lda + l]; + } + if (y_transpose) { + bv = b[j * ldb + l]; + } else { + bv = b[l * ldb + j]; + } + sum += av * bv; + } + c[i * ldc + j] = alpha * sum; + } + } + } +} + +TEST(search_aligned_mat_mul_compute, normal) { + Env::Init(); + for (int seq_num : {1, 2}) { + for (int x_batch_size : {1, 3}) { + for (int x_inner_size : {1, 5}) { + for (int out_inner_size : {1, 4}) { + for (bool x_transpose : {true, false}) { + for (bool y_transpose : {true, false}) { + for (float alpha : {1., 2.}) { + // infer x_dims and y_dims + int y_batch_size; + int y_inner_size; + int out_batch_size; + if (x_transpose) { + if (y_transpose) { + y_batch_size = out_inner_size; + y_inner_size = x_batch_size; + out_batch_size = x_inner_size; + } else { + y_batch_size = x_batch_size; + y_inner_size = out_inner_size; + out_batch_size = x_inner_size; + } + } else { + if (y_transpose) { + y_batch_size = out_inner_size; + y_inner_size = x_inner_size; + out_batch_size = x_batch_size; + } else { + y_batch_size = x_inner_size; + y_inner_size = out_inner_size; + out_batch_size = x_batch_size; + } + } + std::vector x_lod_0(seq_num + 1); + std::vector y_lod_0(seq_num + 1); + std::vector out_lod_0(seq_num + 1); + x_lod_0[0] = 0; + y_lod_0[0] = 0; + out_lod_0[0] = 0; + for (int i = 0; i < seq_num; i++) { + x_lod_0[i + 1] = x_lod_0[i] + x_batch_size; + y_lod_0[i + 1] = y_lod_0[i] + y_batch_size; + out_lod_0[i + 1] = out_lod_0[i] + out_batch_size; + } + LoD x_lod; + LoD y_lod; + LoD out_lod; + x_lod.push_back(x_lod_0); + y_lod.push_back(y_lod_0); + out_lod.push_back(out_lod_0); + DDim x_dims({static_cast(x_lod_0.back()), + static_cast(x_inner_size)}); + DDim y_dims({static_cast(y_lod_0.back()), + static_cast(y_inner_size)}); + DDim out_dims({static_cast(out_lod_0.back()), + static_cast(out_inner_size)}); + // prepare input&output tensors + Tensor x_dev, x_host, y_dev, y_host, out_dev, out_host, out_ref; + x_host.Resize(x_dims); + y_host.Resize(y_dims); + out_host.Resize(out_dims); + x_dev.Resize(x_dims); + y_dev.Resize(y_dims); + out_dev.Resize(out_dims); + out_ref.Resize(out_dims); + x_host.set_lod(x_lod); + y_host.set_lod(y_lod); + out_host.set_lod(out_lod); + x_dev.set_lod(x_lod); + y_dev.set_lod(y_lod); + out_dev.set_lod(out_lod); + out_ref.set_lod(out_lod); + auto out_dev_data = out_dev.mutable_data(TARGET(kCUDA)); + auto x_host_data = x_host.mutable_data(); + auto y_host_data = y_host.mutable_data(); + auto out_host_data = out_host.mutable_data(); + auto out_ref_data = out_ref.mutable_data(); + for (int i = 0; i < x_host.dims().production(); i++) { + x_host_data[i] = i * 0.125f; + } + for (int i = 0; i < y_host.dims().production(); i++) { + y_host_data[i] = i * 0.5f; + } + x_dev.Assign(x_host_data, + x_host.dims()); + y_dev.Assign(y_host_data, + y_host.dims()); + // prepare cuda context, initialize param, and run kernel + operators::MatMulParam param; + param.X = &x_dev; + param.Y = &y_dev; + param.Out = &out_dev; + param.alpha = alpha; + 
param.transpose_X = x_transpose; + param.transpose_Y = y_transpose; + std::unique_ptr ctx(new KernelContext); + auto& cuda_ctx = ctx->As(); + cuda_ctx.InitOnce(); + int dev_id = TargetWrapper::GetCurDevice(); + cuda_ctx.Init(dev_id); + SearchAlignedMatMulCompute search_aligned_mat_mul; + search_aligned_mat_mul.SetParam(param); + search_aligned_mat_mul.SetContext(std::move(ctx)); + search_aligned_mat_mul.Launch(); + cudaDeviceSynchronize(); + CopySync( + out_host_data, + out_dev_data, + sizeof(float) * out_dev.dims().production(), + IoDirection::DtoH); + // run reference + param.X = &x_host; + param.Y = &y_host; + param.Out = &out_ref; + search_aligned_mat_mul_compute_ref(param); + // verify result + for (int i = 0; i < out_ref.dims().production(); i++) { + EXPECT_NEAR(out_host_data[i], out_ref_data[i], 1e-5); + } + } + } + } + } + } + } + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_fc_compute.cu b/lite/kernels/cuda/search_fc_compute.cu new file mode 100644 index 0000000000..591e2474a4 --- /dev/null +++ b/lite/kernels/cuda/search_fc_compute.cu @@ -0,0 +1,170 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/search_fc_compute.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { +template +static void anakin_NV_gemv(cublasHandle_t handle, + const bool TransA, + const int M, + const int N, + const T alpha, + const T* A, + const T* x, + const T beta, + T* y); +template <> +void anakin_NV_gemv(cublasHandle_t handle, + const bool TransA, + const int M, + const int N, + const float alpha, + const float* A, + const float* x, + const float beta, + float* y) { + cublasOperation_t cuTransA = (TransA == false) ? CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK( + cublasSgemv(handle, cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); +} +template +static void anakin_NV_gemm(cublasHandle_t handle, + const bool TransA, + const bool TransB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const T* B, + const T beta, + T* C); + +template <> +void anakin_NV_gemm(cublasHandle_t handle, + const bool TransA, + const bool TransB, + const int M, + const int N, + const int K, + const float alpha, + const float* A, + const float* B, + const float beta, + float* C) { + // Note that cublas follows fortran order. + int lda = (!TransA /* == CblasNoTrans*/) ? K : M; + int ldb = (!TransB /* == CblasNoTrans*/) ? N : K; + cublasOperation_t cuTransA = + (!TransA /* == CblasNoTrans*/) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (!TransB /* == CblasNoTrans*/) ? 
CUBLAS_OP_N : CUBLAS_OP_T; + CUBLAS_CHECK(cublasSgemm(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + A, + lda, + &beta, + C, + N)); +} + +template <> +void anakin_NV_gemm(cublasHandle_t handle, + const bool TransA, + const bool TransB, + const int M, + const int N, + const int K, + const char alpha, + const char* A, + const char* B, + const char beta, + char* C) { + LOG(FATAL) << "int8 gemm is not implemented"; +} + +template +static __global__ void add_bias(int n, + int output_size, + const T* bias, + T* dout) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int bias_index = index % output_size; + if (index < n) { + dout[index] = dout[index] + bias[bias_index]; + } +} + +template +void SearchFcCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + const Tensor* x_tensor = param.X; + param.Out->Resize({x_tensor->dims()[0], param.out_size}); + _M = x_tensor->dims().count(0, 1); + _K = x_tensor->dims().count(1, x_tensor->numel()); + _N = param.out_size; + const T* din = x_tensor->data(); + Tensor* out_tensor = param.Out; + T* dout = out_tensor->mutable_data(TARGET(kCUDA)); + const Tensor* w_tensor = param.W; + const T* weight = w_tensor->data(); + const Tensor* b_tensor = param.b; + const T* bias = b_tensor->data(); + cublasCreate(&_handle); + if (_M == 1 && _K > 50000) { + anakin_NV_gemv(_handle, false, _N, _K, (T)1, weight, din, (T)0, dout); + } else { + anakin_NV_gemm(_handle, + false, + !_flag_trans_weights, + _M, + _N, + _K, + (T)1, + din, + weight, + (T)0, + dout); + } + int total_size = _M * _N; + add_bias<<>>( + total_size, _N, bias, dout); +} +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_fc, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_fc_compute.h b/lite/kernels/cuda/search_fc_compute.h new file mode 100644 index 0000000000..db09362734 --- /dev/null +++ b/lite/kernels/cuda/search_fc_compute.h @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
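The operand swap in anakin_NV_gemm above (passing cuTransB, cuTransA and the dimensions as N, M, K) is the standard way to evaluate a row-major product with column-major cuBLAS: C = A * B is issued as C^T = B^T * A^T. A plain reference of the row-major computation it is meant to match, for illustration only:

void gemm_rowmajor_ref_sketch(int M, int N, int K, float alpha,
                              const float* A, const float* B, float beta,
                              float* C) {
  // C (M x N, row-major) = alpha * A (M x K) * B (K x N) + beta * C
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      float acc = 0.f;
      for (int l = 0; l < K; ++l) acc += A[i * K + l] * B[l * N + j];
      C[i * N + j] = alpha * acc + beta * C[i * N + j];
    }
  }
}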
+ +#pragma once +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +const int CUDA_NUM_THREADS = 512; +inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} +inline int CUDA_GET_BLOCKS(const int N, const int base) { + return (N + base - 1) / base; +} + +template +class SearchFcCompute : public KernelLite { + public: + using param_t = operators::SearchFcParam; + void Run() override; + virtual ~SearchFcCompute() = default; + + private: + bool _flag_trans_weights{false}; + int _M; + int _K; + int _N; + cublasHandle_t _handle; + bool _is_continue_buf{true}; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_fc_compute_test.cc b/lite/kernels/cuda/search_fc_compute_test.cc new file mode 100644 index 0000000000..f06028fbe1 --- /dev/null +++ b/lite/kernels/cuda/search_fc_compute_test.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/search_fc_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +void fc_cpu_base(const lite::Tensor* X, + const lite::Tensor* W, + const lite::Tensor* b, + int out_size, + lite::Tensor* Out) { + const float* data_in = X->data(); + const float* bias = b->data(); + const float* weights = W->data(); + float* data_out = Out->mutable_data(); + int out_rows = X->dims()[0]; + int in_cols = X->numel() / out_rows; + int out_cols = W->numel() / in_cols; + int index_out; + + for (int i = 0; i < out_rows; i++) { + for (int j = 0; j < out_cols; j++) { + index_out = i * out_cols + j; + data_out[index_out] = bias ? 
bias[j] : 0; + + for (int k = 0; k < in_cols; k++) { + data_out[index_out] += + data_in[i * in_cols + k] * weights[j * in_cols + k]; + } + } + } +} + +TEST(search_fc, normal) { + SearchFcCompute search_fc_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + operators::SearchFcParam param; + lite::Tensor X, X_gpu, W, W_gpu, b, b_gpu; + lite::Tensor Out, Out_cpu, out_ref; + std::vector x_shape{1, 4}; + X.Resize(lite::DDim(x_shape)); + std::vector w_shape{3, 4}; + W.Resize(lite::DDim(w_shape)); + std::vector b_shape{3}; + b.Resize(lite::DDim(b_shape)); + std::vector out_shape{1, 4}; + Out.Resize(lite::DDim(out_shape)); + out_ref.Resize(lite::DDim(out_shape)); + auto x_data = X.mutable_data(); + auto w_data = W.mutable_data(); + auto b_data = b.mutable_data(); + auto out_data_ref = out_ref.mutable_data(); + for (int64_t i = 0; i < X.dims().production(); i++) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < W.dims().production(); i++) { + w_data[i] = static_cast(i); + } + for (int64_t i = 0; i < b.dims().production(); i++) { + b_data[i] = static_cast(i); + } + X_gpu.Assign(x_data, X.dims()); + W_gpu.Assign(w_data, W.dims()); + b_gpu.Assign(b_data, b.dims()); + param.X = &X_gpu; + param.W = &W_gpu; + param.b = &b_gpu; + param.out_size = 4; + param.Out = &Out; + search_fc_kernel.SetParam(param); + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + search_fc_kernel.SetContext(std::move(ctx)); + search_fc_kernel.Run(); + fc_cpu_base(&X, &W, &b, 4, &out_ref); + cudaDeviceSynchronize(); + const float* out_data = Out.data(); + float* out_cpu_data = Out_cpu.mutable_data(); + CopySync( + out_cpu_data, out_data, sizeof(float) * Out.numel(), IoDirection::DtoH); + for (int i = 0; i < Out.numel(); ++i) { + EXPECT_NEAR(out_cpu_data[i], out_data_ref[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_grnn_compute.cu b/lite/kernels/cuda/search_grnn_compute.cu new file mode 100644 index 0000000000..468b66e568 --- /dev/null +++ b/lite/kernels/cuda/search_grnn_compute.cu @@ -0,0 +1,351 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/search_grnn_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { +using Tensor = lite::Tensor; + +template +T sigmoid(T z) { + return 1 / (1 + std::exp(-z)); +} + +template +__global__ void PreComputeKernel( + const int num, const T* w_x_e, const T* wz_x_e, T* tilde, T* z, T* hidden) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < num) { + tilde[index] = std::tanh(w_x_e[index]); + z[index] = 1 / (1 + std::exp(-wz_x_e[index])); + hidden[index] = (1. 
- z[index]) * tilde[index]; + } +} + +template +__global__ void PostComputeKernel(const int start, + const int end, + const int cap_h, + const int w_tm1, + const T* wr_x_e, + const T* ur_x_h, + const T* wz_x_e, + const T* uz_x_h, + const T* w_x_e, + const T* u_x_h, + T* r, + T* z, + T* tilde, + T* hidden) { + int j = start + blockIdx.x * blockDim.x + threadIdx.x; + if (j < end) { + r[j] = 1 / (1 + std::exp(-(wr_x_e[j] + ur_x_h[j]))); + z[j] = 1 / (1 + std::exp(-(wz_x_e[j] + uz_x_h[j]))); + tilde[j] = std::tanh(w_x_e[j] + r[j] * u_x_h[j]); + hidden[j] = z[j] * hidden[j - cap_h * w_tm1] + (1.0 - z[j]) * tilde[j]; + } +} + +void SearchGrnnCompute::PrepareForRun() { + gemm_impl_.reset(new lite::cuda::math::Gemm); +} + +void SearchGrnnCompute::PrepareLayout(const Tensor* input_blob) { + auto& param = this->Param(); + auto& context = this->ctx_->template As(); + auto cuda_stream = context.exec_stream(); + + auto* _input = input_blob; + int dim0 = _input->dims()[0]; + int dim1 = 1; + if (_input->dims().size() > 1) { + dim1 = _input->dims()[1]; + } + int batch = _input->lod()[0].size() - 1; + auto& offset = _input->lod()[0]; + + idx_sorted_by_width_cpu = std::make_shared(); + idx_sorted_by_width_cpu->Resize({batch}); + int* idx_sorted_by_width_cpu_data = + idx_sorted_by_width_cpu->mutable_data(); + + Tensor _width; + _width.Resize({batch}); + int* width_data = _width.mutable_data(); + // sort sequence by width (descending) and find the largest width in the + // batch + for (int i = 0; i < batch; i++) { + width_data[i] = offset[i + 1] - offset[i]; + idx_sorted_by_width_cpu_data[i] = i; + } + std::sort(idx_sorted_by_width_cpu_data, + idx_sorted_by_width_cpu_data + batch, + [&_width](int a, int b) { + return _width.data()[a] > _width.data()[b]; + }); + int max_width = width_data[idx_sorted_by_width_cpu_data[0]]; + + // start of reorganizing the input + std::vector new_offset; + new_offset.resize(max_width + 1); + new_offset[0] = 0; + int j = batch - 1; + int last_width = 0; + int sub_row = 0; + int sub_col = 0; + + for (int i = 1; i <= max_width;) { + for (int k = j; k >= 0; --k) { + if (width_data[idx_sorted_by_width_cpu_data[k]] > last_width) { + sub_row = width_data[idx_sorted_by_width_cpu_data[k]] - last_width; + sub_col = k + 1; + for (int s = 0; s < sub_row; s++) { + new_offset[i] = new_offset[i - 1] + sub_col; + i++; + } + // move on + last_width = width_data[idx_sorted_by_width_cpu_data[k]]; + j = k - 1; + break; + } + } + } + + // copying to the reorganized buffer + auto* _layout_input = new Tensor(); + auto* _layout_input_gpu = param.layout_input; + if (_input->dims().size() == 1) { + // _layout_input.reshape_batch_sequence({dim0}, new_offset); + LOG(FATAL) << "_input->dims().size() = 1, error."; + } else { + // _layout_input.reshape_batch_sequence({dim0, dim1}, new_offset); + LoD new_lod; + new_lod.push_back(new_offset); + _layout_input->set_lod(new_lod); + _layout_input->Resize({dim0, dim1}); + _layout_input_gpu->set_lod(new_lod); + _layout_input_gpu->Resize({dim0, dim1}); + } + + auto* new_emb = _layout_input->mutable_data(); + auto* input_cpu = new Tensor(); + input_cpu->Resize(_input->dims()); + auto* input_cpu_data = input_cpu->mutable_data(); + TargetW::MemcpyAsync(input_cpu_data, + _input->data(), + _input->numel() * sizeof(float), + IoDirection::DtoH, + cuda_stream); + for (int i = 0; i < max_width; i++) { + int w = new_offset[i + 1] - new_offset[i]; + auto* emb_start = new_emb + dim1 * new_offset[i]; + for (int j = 0; j < w; ++j) { + memcpy(emb_start + dim1 * j, + input_cpu_data + 
dim1 * offset[idx_sorted_by_width_cpu_data[j]] + + dim1 * i, + dim1 * sizeof(float)); + } + } + + auto* _layout_input_gpu_data = + _layout_input_gpu->mutable_data(TARGET(kCUDA)); + TargetW::MemcpyAsync(_layout_input_gpu_data, + new_emb, + _layout_input->numel() * sizeof(float), + IoDirection::HtoD, + cuda_stream); + delete _layout_input; + delete input_cpu; +} + +void SearchGrnnCompute::CopyBack(float* from, float* to, int step) { + auto& param = this->Param(); + auto& context = this->ctx_->template As(); + auto stream = context.exec_stream(); + auto* _input = param.x; + auto* _layout_input = param.layout_input; + + const auto& offset = _input->lod()[0]; + const auto& new_offset = _layout_input->lod()[0]; + const auto* idx_sorted_by_width_cpu_data = + idx_sorted_by_width_cpu->data(); + for (size_t i = 0; i < _layout_input->lod()[0].size() - 1; ++i) { + int w = new_offset[i + 1] - new_offset[i]; + for (int j = 0; j < w; j++) { + TargetW::MemcpyAsync( + to + step * (offset[idx_sorted_by_width_cpu_data[j]] + i), + from + (new_offset[i] + j) * step, + step * sizeof(float), + IoDirection::DtoD, + stream); + } + } +} + +void SearchGrnnCompute::Run() { + CHECK(ctx_) << "running context should be set first"; + auto& param = this->Param(); + auto& context = this->ctx_->template As(); + auto stream = context.exec_stream(); + + auto* bottom = param.x; + auto* wi = param.wi; + auto* wh = param.wh; + auto* top = param.out; + auto* _buffer = param.tmp_buffer; + int _cap_h = param.num_hidden; + int _cap_e = param.num_input; + + int _cap_l = bottom->dims()[0]; + int batch = bottom->lod()[0].size() - 1; + + const auto& offset = bottom->lod()[0]; + LoD top_lod; + top_lod.push_back(offset); + top->set_lod(top_lod); + std::vector top_dims_vec{_cap_l, _cap_h}; + top->Resize(top_dims_vec); + auto* top_hidden = top->mutable_data(TARGET(kCUDA)); + + const auto* dense_e2h = wi->data(); + const auto* dense_h2h = wh->data(); + + const auto* e2h = dense_e2h; + const auto* e2hr = dense_e2h + 1 * _cap_e * _cap_h; + const auto* e2hz = dense_e2h + 2 * _cap_e * _cap_h; + const auto* h2h = dense_h2h; + const auto* h2hr = dense_h2h + 1 * _cap_h * _cap_h; + const auto* h2hz = dense_h2h + 2 * _cap_h * _cap_h; + + PrepareLayout(bottom); + + auto* _layout_input = param.layout_input; + auto* new_emb = _layout_input->data(); + const auto& new_offset = _layout_input->lod()[0]; + int max_width = _layout_input->lod()[0].size() - 1; + + // this buffer is used for book keeping info which will be used in bp + // buffer also needed in bp, so make it larger + _buffer->Resize({20, _cap_l, _cap_h}); + auto* buffer_data = _buffer->mutable_data(TARGET(kCUDA)); + auto* w_x_e = buffer_data + 0 * _cap_l * _cap_h; + auto* wr_x_e = buffer_data + 1 * _cap_l * _cap_h; + auto* wz_x_e = buffer_data + 2 * _cap_l * _cap_h; + auto* u_x_h = buffer_data + 3 * _cap_l * _cap_h; + auto* ur_x_h = buffer_data + 4 * _cap_l * _cap_h; + auto* uz_x_h = buffer_data + 5 * _cap_l * _cap_h; + auto* r = buffer_data + 6 * _cap_l * _cap_h; + auto* z = buffer_data + 7 * _cap_l * _cap_h; + auto* tilde = buffer_data + 8 * _cap_l * _cap_h; + // the internal hidden + auto* hidden = buffer_data + 19 * _cap_l * _cap_h; + + gemm_impl_->init(false, true, _cap_l, _cap_h, _cap_e, &context); + gemm_impl_->run(1.0f, 0.0f, new_emb, e2h, w_x_e, &context); + gemm_impl_->init(false, true, _cap_l, _cap_h, _cap_e, &context); + gemm_impl_->run(1.0f, 0.0f, new_emb, e2hr, wr_x_e, &context); + gemm_impl_->init(false, true, _cap_l, _cap_h, _cap_e, &context); + gemm_impl_->run(1.0f, 0.0f, 
new_emb, e2hz, wz_x_e, &context); + + // precompute hidden0 + int num = batch * _cap_h; + int threads = 512; + int blocks = (num + threads - 1) / threads; + PreComputeKernel<<>>( + num, w_x_e, wz_x_e, tilde, z, hidden); + + // recurrence + for (int i = 1; i < max_width; i++) { + int w_tm1 = new_offset[i] - new_offset[i - 1]; + int w = new_offset[i + 1] - new_offset[i]; + + // precompute hidden i-1 to hidden i + auto* htm1 = hidden + new_offset[i - 1] * _cap_h; + + gemm_impl_->init(false, true, w, _cap_h, _cap_h, &context); + gemm_impl_->run( + 1.0f, 0.0f, htm1, h2h, u_x_h + new_offset[i] * _cap_h, &context); + gemm_impl_->init(false, true, w, _cap_h, _cap_h, &context); + gemm_impl_->run( + 1.0f, 0.0f, htm1, h2hr, ur_x_h + new_offset[i] * _cap_h, &context); + gemm_impl_->init(false, true, w, _cap_h, _cap_h, &context); + gemm_impl_->run( + 1.0f, 0.0f, htm1, h2hz, uz_x_h + new_offset[i] * _cap_h, &context); + + // compute the gate and hidden + int start = new_offset[i] * _cap_h; + int end = (new_offset[i] + w) * _cap_h; + PostComputeKernel<<>>(start, + end, + _cap_h, + w_tm1, + wr_x_e, + ur_x_h, + wz_x_e, + uz_x_h, + w_x_e, + u_x_h, + r, + z, + tilde, + hidden); + } + + CopyBack(hidden, top_hidden, _cap_h); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_grnn, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchGrnnCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Wi", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Wh", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("tmp_buffer", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("idx_sorted_by_width", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("layout_input", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_grnn_compute.h b/lite/kernels/cuda/search_grnn_compute.h new file mode 100644 index 0000000000..73d84635d0 --- /dev/null +++ b/lite/kernels/cuda/search_grnn_compute.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
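PreComputeKernel and PostComputeKernel above implement a GRU-style cell element-wise: the first time step uses only the input projections, and later steps combine them with the projected previous hidden state. A scalar sketch of the recurrence they compute (the w*_x / u*_h arguments stand for the precomputed input and hidden projections):

#include <cmath>
inline float grnn_cell_sketch(float w_x, float wr_x, float wz_x,
                              float u_h, float ur_h, float uz_h,
                              float h_prev) {
  auto sigmoid = [](float v) { return 1.f / (1.f + std::exp(-v)); };
  float r = sigmoid(wr_x + ur_h);          // reset gate
  float z = sigmoid(wz_x + uz_h);          // update gate
  float tilde = std::tanh(w_x + r * u_h);  // candidate hidden state
  return z * h_prev + (1.f - z) * tilde;   // new hidden state
}
// First step (no h_prev): tilde = tanh(w_x), z = sigmoid(wz_x), h = (1 - z) * tilde.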
+ +#pragma once +#include +#include "lite/backends/cuda/blas.h" +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SearchGrnnCompute + : public KernelLite { + public: + using param_t = operators::SearchGrnnParam; + using TargetW = TargetWrapper; + + void PrepareForRun() override; + void Run() override; + virtual ~SearchGrnnCompute() = default; + + private: + std::shared_ptr idx_sorted_by_width_cpu; + std::unique_ptr> gemm_impl_; + void PrepareLayout(const Tensor* input); + void CopyBack(float* from, float* to, int step); +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_grnn_compute_test.cc b/lite/kernels/cuda/search_grnn_compute_test.cc new file mode 100644 index 0000000000..08b96e1f1e --- /dev/null +++ b/lite/kernels/cuda/search_grnn_compute_test.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/search_grnn_compute.h" +#include +#include +#include +#include +#include "lite/api/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +using Tensor = lite::Tensor; + +TEST(search_grnn, normal) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + SearchGrnnCompute kernel; + operators::SearchGrnnParam param; + + int num_input = 6; + int num_hidden = 6; + int num_batch = 3; + Tensor x, wi, wh, out, idx_sorted_by_width, layout_input, tmp_buffer; + x.Resize({num_batch, num_input}); + wi.Resize({3, num_hidden, num_input}); + wh.Resize({3, num_hidden, num_hidden}); + LoD x_lod{}; + x_lod.push_back({0, 1, 3}); + x.set_lod(x_lod); + + Tensor x_cpu, wi_cpu, wh_cpu, out_cpu, layout_input_cpu, tmp_buffer_cpu; + x_cpu.Resize({num_batch, num_input}); + wi_cpu.Resize({3, num_hidden, num_input}); + wh_cpu.Resize({3, num_hidden, num_hidden}); + out_cpu.Resize({num_batch, num_hidden}); + layout_input_cpu.Resize({num_batch, num_input}); + tmp_buffer_cpu.Resize({20, num_batch, num_hidden}); + auto* x_cpu_data = x_cpu.mutable_data(); + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = static_cast(i); + } + auto* wi_cpu_data = wi_cpu.mutable_data(); + for (int i = 0; i < wi_cpu.numel(); ++i) { + wi_cpu_data[i] = static_cast(i); + } + auto* wh_cpu_data = wh_cpu.mutable_data(); + for (int i = 0; i < wh_cpu.numel(); ++i) { + wh_cpu_data[i] = static_cast(i); + } + + x.Assign(x_cpu_data, x_cpu.dims()); + wi.Assign(wi_cpu_data, wi_cpu.dims()); + wh.Assign(wh_cpu_data, wh_cpu.dims()); + + param.x = &x; + param.wi = &wi; + param.wh = &wh; + param.out = &out; + param.idx_sorted_by_width = &idx_sorted_by_width; + param.layout_input = &layout_input; + param.tmp_buffer = &tmp_buffer; + param.num_input = num_input; + param.num_hidden = num_hidden; + kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + 
context.SetExecStream(stream); + kernel.SetContext(std::move(ctx)); + kernel.Launch(); + cudaDeviceSynchronize(); + + auto* out_cpu_data = out_cpu.mutable_data(); + auto* out_data = out.mutable_data(TARGET(kCUDA)); + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + LOG(INFO) << "out_data:"; + for (int i = 0; i < out.numel(); i++) { + // EXPECT_NEAR(out_cpu_data[i], ref_results[i], 1e-5); + LOG(INFO) << out_cpu_data[i]; + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_group_padding_compute.cu b/lite/kernels/cuda/search_group_padding_compute.cu new file mode 100644 index 0000000000..697e53dbb6 --- /dev/null +++ b/lite/kernels/cuda/search_group_padding_compute.cu @@ -0,0 +1,164 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/search_group_padding_compute.h" + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { +using Tensor = lite::Tensor; + +template +__global__ void ker_search_group_padding(Dtype* out_emb_padding_data, + Dtype* out_padding_data, + const Dtype* in_data, + const uint64_t* offset, + const int seq_num, + const int max_len, + const int emb_size, + const Dtype pad_id, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % emb_size; + int word_id = tid / emb_size; + int seq_id = word_id / max_len; + int word_id_in_seq = word_id % max_len; + int cur_len = offset[seq_id + 1] - offset[seq_id]; + if (word_id_in_seq < cur_len) { + out_emb_padding_data[tid] = + in_data[(offset[seq_id] + word_id_in_seq) * emb_size + emb_id]; + } else { + out_emb_padding_data[tid] = 0.f; + if (emb_id == 0) { + out_padding_data[word_id] = pad_id; + } + } + } +} + +void SearchGroupPaddingCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto cuda_stream = ctx.exec_stream(); + + const Tensor* x = param.x; + Tensor* out_emb_padding = param.out_emb_padding; + Tensor* out_new = param.out_new; + Tensor* out_padding = param.out_padding; + const float pad_id = static_cast(param.pad_id); + const float* in_data = x->data(); + const auto& in_seq_offset = x->lod()[0]; + int batch = in_seq_offset.size() - 1; + int max_seq = 0; + for (int i = 0; i < batch; ++i) { + if (in_seq_offset[i + 1] - in_seq_offset[i] > max_seq) { + max_seq = in_seq_offset[i + 1] - in_seq_offset[i]; + } + } + std::vector new_offset; + new_offset.resize(batch + 1); + for (int i = 0; i < batch + 1; ++i) { + new_offset[i] = i * max_seq; + } + std::vector x_dims = x->dims().Vectorize(); + LoD out_emb_padding_lod; + out_emb_padding_lod.push_back(new_offset); + out_emb_padding->set_lod(out_emb_padding_lod); + out_emb_padding->Resize({batch * 
max_seq, x_dims[1]}); + float* out_emb_padding_data = + out_emb_padding->mutable_data(TARGET(kCUDA)); + + LoD out_new_lod; + out_new_lod.push_back(in_seq_offset); + out_new->set_lod(out_new_lod); + out_new->Resize({x_dims[0], 1}); + float* out_new_data = out_new->mutable_data(TARGET(kCUDA)); + + LoD out_padding_lod; + out_padding_lod.push_back(new_offset); + out_padding->set_lod(out_padding_lod); + out_padding->Resize({batch * max_seq, 1}); + float* out_padding_data = out_padding->mutable_data(TARGET(kCUDA)); + + const int count = out_emb_padding->numel(); + const auto& out_emb_padding_seq_offset = out_emb_padding->lod()[0]; + int max_len = out_emb_padding_seq_offset[1]; + int seq_num = out_emb_padding_seq_offset.size() - 1; + int emb_size = x->dims()[1]; + _in_seq_offset.Resize({seq_num + 1, 1, 1, 1}); + uint64_t* offset_data = _in_seq_offset.mutable_data(TARGET(kCUDA)); + + TargetWrapperCuda::MemcpyAsync(offset_data, + in_seq_offset.data(), + sizeof(uint64_t) * in_seq_offset.size(), + IoDirection::HtoD, + cuda_stream); + + TargetWrapperCuda::MemsetSync( + out_new_data, 0, out_new->dims()[0] * out_new->dims()[1] * sizeof(float)); + TargetWrapperCuda::MemsetSync( + out_padding_data, + 0, + out_padding->dims()[0] * out_padding->dims()[1] * sizeof(float)); + + ker_search_group_padding< + float><<>>( + out_emb_padding_data, + out_padding_data, + in_data, + offset_data, + seq_num, + max_len, + emb_size, + pad_id, + count); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_group_padding, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchGroupPaddingCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out_emb_padding", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out_new", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out_padding", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_group_padding_compute.h b/lite/kernels/cuda/search_group_padding_compute.h new file mode 100644 index 0000000000..88391e6d65 --- /dev/null +++ b/lite/kernels/cuda/search_group_padding_compute.h @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
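For reference, a compact CPU sketch of what ker_search_group_padding above produces, assuming one LoD level `offset` over the rows of the input (emb_size columns per row): every sequence is right-padded with zeros to the longest length, and out_padding marks padded word slots with pad_id.

#include <cstdint>
#include <vector>
void group_padding_ref_sketch(const std::vector<float>& in,
                              const std::vector<uint64_t>& offset,
                              int emb_size, float pad_id,
                              std::vector<float>* out_emb_padding,
                              std::vector<float>* out_padding) {
  int seq_num = static_cast<int>(offset.size()) - 1;
  int max_len = 0;
  for (int i = 0; i < seq_num; ++i) {
    int len = static_cast<int>(offset[i + 1] - offset[i]);
    if (len > max_len) max_len = len;
  }
  out_emb_padding->assign(seq_num * max_len * emb_size, 0.f);
  out_padding->assign(seq_num * max_len, 0.f);
  for (int s = 0; s < seq_num; ++s) {
    int len = static_cast<int>(offset[s + 1] - offset[s]);
    for (int w = 0; w < len; ++w) {
      const float* src = &in[(offset[s] + w) * emb_size];
      float* dst = &(*out_emb_padding)[(s * max_len + w) * emb_size];
      for (int e = 0; e < emb_size; ++e) dst[e] = src[e];
    }
    for (int w = len; w < max_len; ++w) {
      (*out_padding)[s * max_len + w] = pad_id;
    }
  }
}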
+ +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SearchGroupPaddingCompute + : public KernelLite { + public: + using param_t = operators::SearchGroupPaddingParam; + + void Run() override; + virtual ~SearchGroupPaddingCompute() = default; + + private: + lite::Tensor _in_seq_offset; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_group_padding_compute_test.cc b/lite/kernels/cuda/search_group_padding_compute_test.cc new file mode 100644 index 0000000000..b831780c87 --- /dev/null +++ b/lite/kernels/cuda/search_group_padding_compute_test.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/search_group_padding_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +TEST(search_group_padding_cuda, run_test) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + lite::Tensor x, x_cpu, x_ref; + lite::Tensor out_emb_padding, out_emb_padding_cpu, out_emb_padding_ref; + lite::Tensor out_new, out_new_cpu, out_new_ref; + lite::Tensor out_padding, out_padding_cpu, out_padding_ref; + + int x_dims0 = 2; + int x_dims1 = 3; + + x.Resize({x_dims0, x_dims1}); + x_cpu.Resize({x_dims0, x_dims1}); + x_ref.Resize({x_dims0, x_dims1}); + out_emb_padding.Resize({1, x_dims1}); + out_emb_padding_cpu.Resize({1, x_dims1}); + out_emb_padding_ref.Resize({1, x_dims1}); + out_new.Resize({x_dims0, 1}); + out_new_cpu.Resize({x_dims0, 1}); + out_new_ref.Resize({x_dims0, 1}); + out_padding.Resize({1, 1}); + out_padding_cpu.Resize({1, 1}); + out_padding_ref.Resize({1, 1}); + + LoD x_lod{}; + x_lod.push_back({0, 1}); + x.set_lod(x_lod); + + auto* x_cpu_data = x_cpu.mutable_data(); + auto* x_ref_data = x_ref.mutable_data(); + auto* out_emb_padding_data = + out_emb_padding.mutable_data(TARGET(kCUDA)); + auto* out_emb_padding_cpu_data = out_emb_padding_cpu.mutable_data(); + auto* out_emb_padding_ref_data = out_emb_padding_ref.mutable_data(); + auto* out_new_data = out_new.mutable_data(TARGET(kCUDA)); + auto* out_new_cpu_data = out_new_cpu.mutable_data(); + auto* out_new_ref_data = out_new_ref.mutable_data(); + auto* out_padding_data = out_padding.mutable_data(TARGET(kCUDA)); + auto* out_padding_cpu_data = out_padding_cpu.mutable_data(); + auto* out_padding_ref_data = out_padding_ref.mutable_data(); + + for (int64_t i = 0; i < x_cpu.dims().production(); i++) { + x_cpu_data[i] = static_cast(i); + x_ref_data[i] = static_cast(i); + } + x.Assign(x_cpu_data, x_cpu.dims()); + out_emb_padding_ref_data[0] = 0.f; + out_emb_padding_ref_data[1] = 1.f; + out_emb_padding_ref_data[2] = 2.f; + out_new_ref_data[0] = 0.f; + out_new_ref_data[1] = 0.f; + out_padding_ref_data[0] = 0.f; + + SearchGroupPaddingCompute sgp_kernel; + 
operators::SearchGroupPaddingParam param; + + param.x = &x; + param.out_emb_padding = &out_emb_padding; + param.out_new = &out_new; + param.out_padding = &out_padding; + + sgp_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + sgp_kernel.SetContext(std::move(ctx)); + sgp_kernel.Launch(); + cudaDeviceSynchronize(); + + CopySync(out_emb_padding_cpu_data, + out_emb_padding_data, + sizeof(float) * out_emb_padding.numel(), + IoDirection::DtoH); + CopySync(out_new_cpu_data, + out_new_data, + sizeof(float) * out_new.numel(), + IoDirection::DtoH); + CopySync(out_padding_cpu_data, + out_padding_data, + sizeof(float) * out_padding.numel(), + IoDirection::DtoH); + + for (int i = 0; i < out_emb_padding_cpu.dims().production(); i++) { + EXPECT_NEAR(out_emb_padding_cpu_data[i], out_emb_padding_ref_data[i], 1e-5); + } + for (int i = 0; i < out_new_cpu.dims().production(); i++) { + EXPECT_NEAR(out_new_cpu_data[i], out_new_ref_data[i], 1e-5); + } + for (int i = 0; i < out_padding_cpu.dims().production(); i++) { + EXPECT_NEAR(out_padding_cpu_data[i], out_padding_ref_data[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_group_padding, kCUDA, kFloat, kNCHW, def); diff --git a/lite/kernels/cuda/search_seq_depadding_compute.cu b/lite/kernels/cuda/search_seq_depadding_compute.cu new file mode 100644 index 0000000000..ecadceab58 --- /dev/null +++ b/lite/kernels/cuda/search_seq_depadding_compute.cu @@ -0,0 +1,115 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/search_seq_depadding_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { +using Tensor = lite::Tensor; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void ker_sequence_depadding_fwd(Dtype* out_data, + const Dtype* in_data, + const int* seq_id_map, + const int seq_num, + const int max_len, + const int emb_size, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % emb_size; + int word_id = tid / emb_size; + int seq_id = seq_id_map[word_id]; + out_data[tid] = in_data[seq_id * emb_size + emb_id]; + } +} + +void SearchSeqDepaddingCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto cuda_stream = ctx.exec_stream(); + + auto* pad = param.pad; + auto* src = param.src; + auto* out = param.out; + + auto* in_data = pad->data(); + out->Resize({src->dims()[0], pad->dims()[1]}); + auto* out_data = out->mutable_data(TARGET(kCUDA)); + const int count = out->numel(); + + const auto& pad_seq_offset = pad->lod()[0]; + const auto& src_seq_offset = src->lod()[0]; + int max_len = pad_seq_offset[1]; + int seq_num = pad_seq_offset.size() - 1; + int emb_size = pad->dims()[1]; + + LoD out_lod; + out_lod.push_back(src_seq_offset); + out->set_lod(out_lod); + std::vector seq_id_map; + for (int i = 0; i < seq_num; i++) { + int cur_len = src_seq_offset[i + 1] - src_seq_offset[i]; + for (int j = 0; j < cur_len; j++) { + seq_id_map.push_back(i * max_len + j); + } + } + + int map_size = seq_id_map.size(); + seq_id_map_tensor.Resize({map_size, 1, 1, 1}); + int* seq_id_map_data = seq_id_map_tensor.mutable_data(TARGET(kCUDA)); + TargetW::MemcpyAsync(seq_id_map_data, + &seq_id_map[0], + seq_id_map.size() * sizeof(int), + IoDirection::HtoD, + cuda_stream); + + int threads = 512; + int blocks = (count + threads - 1) / threads; + ker_sequence_depadding_fwd<<>>( + out_data, in_data, seq_id_map_data, seq_num, max_len, emb_size, count); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_seq_depadding, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchSeqDepaddingCompute, + def) + .BindInput("Src", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("Pad", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_seq_depadding_compute.h b/lite/kernels/cuda/search_seq_depadding_compute.h new file mode 100644 index 0000000000..a06f39bee2 --- /dev/null +++ b/lite/kernels/cuda/search_seq_depadding_compute.h @@ -0,0 +1,39 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SearchSeqDepaddingCompute + : public KernelLite { + public: + using param_t = operators::SearchSeqDepaddingParam; + using TargetW = TargetWrapper; + + void Run() override; + virtual ~SearchSeqDepaddingCompute() = default; + + private: + Tensor seq_id_map_tensor; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_seq_depadding_compute_test.cc b/lite/kernels/cuda/search_seq_depadding_compute_test.cc new file mode 100644 index 0000000000..9c23ff14ab --- /dev/null +++ b/lite/kernels/cuda/search_seq_depadding_compute_test.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
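Depadding is the inverse of the group padding above: each packed output row i is gathered from padded row seq_id_map[i]. A sketch of the index map the kernel builds on the host; for the test below, pad_lod {0, 4, 6} gives max_len = 4 and src_lod {0, 2, 3} gives sequence lengths {2, 1}, so seq_id_map = {0, 1, 4} and the expected output rows are pad rows 0, 1 and 4 (values 0..7 and 16..19).

#include <cstdint>
#include <vector>
std::vector<int> build_seq_id_map_sketch(
    const std::vector<uint64_t>& src_offset, int max_len) {
  std::vector<int> map;
  int seq_num = static_cast<int>(src_offset.size()) - 1;
  for (int i = 0; i < seq_num; ++i) {
    int cur_len = static_cast<int>(src_offset[i + 1] - src_offset[i]);
    for (int j = 0; j < cur_len; ++j) map.push_back(i * max_len + j);
  }
  return map;
}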
+ +#include "lite/kernels/cuda/search_seq_depadding_compute.h" +#include +#include +#include +#include +#include "lite/api/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +using Tensor = lite::Tensor; + +TEST(search_seq_depadding, normal) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + SearchSeqDepaddingCompute kernel; + operators::SearchSeqDepaddingParam param; + + Tensor pad, src, out; + pad.Resize({2 * 3, 4}); + src.Resize({3, 1}); + out.Resize({3, 4}); + LoD pad_lod{}; + pad_lod.push_back({0, 4, 6}); + pad.set_lod(pad_lod); + LoD src_lod{}; + src_lod.push_back({0, 2, 3}); + src.set_lod(src_lod); + + Tensor pad_cpu, src_cpu, out_cpu; + pad_cpu.Resize({2 * 3, 4}); + src_cpu.Resize({3, 1}); + out_cpu.Resize({3, 4}); + + auto* pad_cpu_data = pad_cpu.mutable_data(); + auto* src_cpu_data = src_cpu.mutable_data(); + for (int i = 0; i < pad_cpu.numel(); ++i) { + pad_cpu_data[i] = static_cast(i); + } + + pad.Assign(pad_cpu_data, pad_cpu.dims()); + src.Assign(src_cpu_data, src_cpu.dims()); + + param.pad = &pad; + param.src = &src; + param.out = &out; + kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + kernel.SetContext(std::move(ctx)); + kernel.Launch(); + cudaDeviceSynchronize(); + + auto* out_cpu_data = out_cpu.mutable_data(); + auto* out_data = out.mutable_data(TARGET(kCUDA)); + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + + std::vector ref_results = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19}; + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], ref_results[i], 1e-5); + // LOG(INFO) << out_cpu_data[i]; + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_seq_fc_compute.cu b/lite/kernels/cuda/search_seq_fc_compute.cu new file mode 100644 index 0000000000..e3ac75afee --- /dev/null +++ b/lite/kernels/cuda/search_seq_fc_compute.cu @@ -0,0 +1,98 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/search_seq_fc_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__global__ void add_bias(int n, + int output_size, + const dtype* bias, + dtype* dout) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int bias_index = index % output_size; + if (index < n) { + dout[index] = dout[index] + bias[bias_index]; + } +} + +void SearchSeqFcCompute::PrepareForRun() { + gemm_impl_.reset(new lite::cuda::math::Gemm); +} + +void SearchSeqFcCompute::Run() { + auto& param = this->Param(); + CHECK(ctx_) << "running context should be set first"; + auto& cuda_ctx = ctx_->template As(); + auto cuda_stream = cuda_ctx.exec_stream(); + + auto x = param.x; + auto w = param.w; + auto b = param.b; + auto out = param.out; + auto out_size = param.out_size; + const auto x_dims = x->dims(); + const auto w_dims = w->dims(); + const auto out_dims = out->dims(); + CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + CHECK_EQ(out_dims.size(), 2) << "The Output(Out) should be 2-D tensor."; + CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; + CHECK_EQ(w_dims[0], out_size) << "Wrong shape: w_dims[0] != out_size"; + CHECK_EQ(out_dims[0], x_dims[0]) << "Wrong shape: out_dims[0] != x_dims[0]"; + CHECK_EQ(out_dims[1], out_size) << "Wrong shape: out_dims[1] != out_size"; + int M = x_dims[0]; + int K = x_dims[1]; + int N = w_dims[0]; + auto x_data = x->data(); + auto w_data = w->data(); + auto out_data = out->mutable_data(TARGET(kCUDA)); + + CHECK(gemm_impl_->init(false, true, M, N, K, &cuda_ctx)); + gemm_impl_->run(1.0f, 0.0f, x_data, w_data, out_data, &cuda_ctx); + + if (b != nullptr) { + auto b_dims = b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; + auto b_data = b->mutable_data(); + int total_size = M * N; + add_bias<<>>(total_size, N, b_data, out_data); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_seq_fc, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SearchSeqFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/search_seq_fc_compute.h b/lite/kernels/cuda/search_seq_fc_compute.h new file mode 100644 index 0000000000..dff8ba2acf --- /dev/null +++ b/lite/kernels/cuda/search_seq_fc_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/context.h" +#include "lite/core/kernel.h" +#include "lite/core/types.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SearchSeqFcCompute : public KernelLite { + public: + using param_t = operators::SearchSeqFcParam; + + void PrepareForRun() override; + void Run() override; + virtual ~SearchSeqFcCompute() = default; + + private: + std::unique_ptr> gemm_impl_{nullptr}; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/search_seq_fc_compute_test.cc b/lite/kernels/cuda/search_seq_fc_compute_test.cc new file mode 100644 index 0000000000..354d1bb5bc --- /dev/null +++ b/lite/kernels/cuda/search_seq_fc_compute_test.cc @@ -0,0 +1,175 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/search_seq_fc_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +void search_seq_fc_compute_ref(const operators::SearchSeqFcParam& param) { + auto x = param.x; + auto w = param.w; + auto b = param.b; + auto out = param.out; + auto out_size = param.out_size; + const auto x_dims = x->dims(); + const auto w_dims = w->dims(); + const auto& x_lod = x->lod(); + CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; + CHECK(!x_lod.empty()) << "The Input(X) must hold lod info."; + const auto& x_lod_0 = x_lod[0]; + CHECK_GE(x_lod_0.size(), 2) << "The Input(X)'s lod info is corrupted."; + CHECK_EQ(x_dims[0], static_cast(x_lod_0.back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; + CHECK_EQ(w_dims[0], out_size) << "Wrong shape: w_dims[0] != out_size"; + int M = x_dims[0]; + int K = x_dims[1]; + int N = w_dims[0]; + auto x_data = x->data(); + auto w_data = w->data(); + auto out_data = out->mutable_data(); + + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + auto sum = static_cast(0); + for (int l = 0; l < K; l++) { + T xv = x_data[i * K + l]; + T wv = w_data[j * K + l]; + sum += xv * wv; + } + out_data[i * N + j] = sum; + } + } + + if (b != nullptr) { + auto b_dims = b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; + auto b_data = b->data(); + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + out_data[i * N + j] += b_data[j]; + } + } + } +} + +TEST(search_seq_fc_compute, normal) { + Env::Init(); + for (auto x_lod_0 : {std::vector({0, 1, 3}), + std::vector({0, 3, 4, 5})}) { + for (auto feature_size : {2, 9}) { + for (auto out_size : {3, 5}) { + for (auto has_bias : {true, false}) { + // infer x_dims, 
w_dims, b_dims and out_dims + DDim x_dims({static_cast(x_lod_0.back()), feature_size}); + DDim w_dims({out_size, feature_size}); + DDim b_dims({has_bias ? out_size : 0}); + DDim out_dims({static_cast(x_lod_0.back()), out_size}); + LoD x_lod; + x_lod.push_back(x_lod_0); + LoD out_lod; + out_lod.push_back(x_lod_0); + // prepare input&output tensors + Tensor x_dev, x_host, w_dev, w_host, b_dev, b_host, out_dev, out_host, + out_ref; + x_host.Resize(x_dims); + w_host.Resize(w_dims); + b_host.Resize(b_dims); + out_host.Resize(out_dims); + x_dev.Resize(x_dims); + w_dev.Resize(w_dims); + b_dev.Resize(b_dims); + out_dev.Resize(out_dims); + out_ref.Resize(out_dims); + x_host.set_lod(x_lod); + out_host.set_lod(out_lod); + x_dev.set_lod(x_lod); + out_dev.set_lod(out_lod); + out_ref.set_lod(out_lod); + auto out_dev_data = out_dev.mutable_data(TARGET(kCUDA)); + auto x_host_data = x_host.mutable_data(); + auto w_host_data = w_host.mutable_data(); + auto out_host_data = out_host.mutable_data(); + auto out_ref_data = out_ref.mutable_data(); + for (int i = 0; i < x_host.dims().production(); i++) { + x_host_data[i] = i * 0.125f; + } + for (int i = 0; i < w_host.dims().production(); i++) { + w_host_data[i] = i * 0.5f; + } + x_dev.Assign(x_host_data, + x_host.dims()); + w_dev.Assign(w_host_data, + w_host.dims()); + // prepare cuda context, initialize param, and run kernel + operators::SearchSeqFcParam param; + param.x = &x_dev; + param.w = &w_dev; + param.out = &out_dev; + param.out_size = out_size; + if (has_bias) { + auto b_host_data = b_host.mutable_data(); + for (int i = 0; i < b_host.dims().production(); i++) { + b_host_data[i] = i * 0.5f; + } + b_dev.Assign(b_host_data, + b_host.dims()); + param.b = &b_dev; + } + std::unique_ptr ctx(new KernelContext); + auto& cuda_ctx = ctx->As(); + cuda_ctx.InitOnce(); + int dev_id = TargetWrapper::GetCurDevice(); + cuda_ctx.Init(dev_id); + SearchSeqFcCompute search_seq_fc; + search_seq_fc.SetParam(param); + search_seq_fc.SetContext(std::move(ctx)); + search_seq_fc.Launch(); + cudaDeviceSynchronize(); + CopySync(out_host_data, + out_dev_data, + sizeof(float) * out_dev.dims().production(), + IoDirection::DtoH); + // run reference + param.x = &x_host; + param.w = &w_host; + param.out = &out_ref; + if (has_bias) { + param.b = &b_host; + } + search_seq_fc_compute_ref(param); + // verify result + for (int i = 0; i < out_ref.dims().production(); i++) { + EXPECT_NEAR(out_host_data[i], out_ref_data[i], 1e-5); + } + } + } + } + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_arithmetic_compute.cu b/lite/kernels/cuda/sequence_arithmetic_compute.cu new file mode 100644 index 0000000000..7593632a14 --- /dev/null +++ b/lite/kernels/cuda/sequence_arithmetic_compute.cu @@ -0,0 +1,249 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
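Illustrative sketch (not part of the patch): the sequence_arithmetic kernels that follow apply an elementwise op (op_type 1 = sum, 2 = sub, 3 = mul) between matching words of two LoD tensors; where a sequence of Y is shorter than the corresponding sequence of X, the extra rows of X are copied through unchanged. A host-side reference for the add case (names are illustrative):

#include <cstddef>
#include <vector>

// x, y are row-major [rows, width]; x_off / y_off are per-sequence row offsets.
std::vector<float> seq_add_ref(const std::vector<float>& x,
                               const std::vector<float>& y,
                               const std::vector<std::size_t>& x_off,
                               const std::vector<std::size_t>& y_off,
                               std::size_t width) {
  std::vector<float> out(x.size());
  for (std::size_t s = 0; s + 1 < x_off.size(); ++s) {
    std::size_t x_len = x_off[s + 1] - x_off[s];
    std::size_t y_len = y_off[s + 1] - y_off[s];
    for (std::size_t r = 0; r < x_len; ++r) {
      for (std::size_t c = 0; c < width; ++c) {
        std::size_t xi = (x_off[s] + r) * width + c;
        out[xi] = x[xi];
        if (r < y_len) out[xi] += y[(y_off[s] + r) * width + c];
      }
    }
  }
  return out;
}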
+ +#include +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_arithmetic_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +const int CUDA_NUM_THREADS = 512; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__global__ void ker_arithmetic_sum(Dtype* out_data, + const Dtype* in_data_0, + const Dtype* in_data_1, + const int* offset_0, + const int* offset_1, + const int* word_id_to_seq_id, + const int seq_num, + const int inner_size, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % inner_size; + int word_id = tid / inner_size; + int seq_id = word_id_to_seq_id[word_id]; + int word_id_in_cur_seq = word_id - offset_0[seq_id]; + int seq_len_1 = offset_1[seq_id + 1] - offset_1[seq_id]; + if (word_id_in_cur_seq < seq_len_1) { + out_data[tid] = + in_data_0[tid] + + in_data_1[(offset_1[seq_id] + word_id_in_cur_seq) * inner_size + + emb_id]; + } else { + out_data[tid] = in_data_0[tid]; + } + } +} + +template +__global__ void ker_arithmetic_sub(Dtype* out_data, + const Dtype* in_data_0, + const Dtype* in_data_1, + const int* offset_0, + const int* offset_1, + const int* word_id_to_seq_id, + const int seq_num, + const int inner_size, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % inner_size; + int word_id = tid / inner_size; + int seq_id = word_id_to_seq_id[word_id]; + int word_id_in_cur_seq = word_id - offset_0[seq_id]; + int seq_len_1 = offset_1[seq_id + 1] - offset_1[seq_id]; + if (word_id_in_cur_seq < seq_len_1) { + out_data[tid] = + in_data_0[tid] - + in_data_1[(offset_1[seq_id] + word_id_in_cur_seq) * inner_size + + emb_id]; + } else { + out_data[tid] = in_data_0[tid]; + } + } +} + +template +__global__ void ker_arithmetic_mul(Dtype* out_data, + const Dtype* in_data_0, + const Dtype* in_data_1, + const int* offset_0, + const int* offset_1, + const int* word_id_to_seq_id, + const int seq_num, + const int inner_size, + const int count) { + CUDA_KERNEL_LOOP(tid, count) { + int emb_id = tid % inner_size; + int word_id = tid / inner_size; + int seq_id = word_id_to_seq_id[word_id]; + int word_id_in_cur_seq = word_id - offset_0[seq_id]; + int seq_len_1 = offset_1[seq_id + 1] - offset_1[seq_id]; + if (word_id_in_cur_seq < seq_len_1) { + out_data[tid] = + in_data_0[tid] * + in_data_1[(offset_1[seq_id] + word_id_in_cur_seq) * inner_size + + emb_id]; + } else { + out_data[tid] = in_data_0[tid]; + } + } +} + +void SequenceArithmeticCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + auto x_data = param.X->data(); + auto x_lod = param.X->lod()[0]; + auto y_data = param.Y->data(); + auto y_lod = param.Y->lod()[0]; + auto out_data = param.Out->mutable_data(TARGET(kCUDA)); + + offset_x.Resize({static_cast(x_lod.size())}); + auto offset_x_data = offset_x.mutable_data(TARGET(kCUDA)); + + offset_y.Resize({static_cast(y_lod.size())}); + auto offset_y_data = offset_y.mutable_data(TARGET(kCUDA)); + + word_id_to_seq_id.Resize({param.X->numel()}); + auto word_id_to_seq_id_data = + word_id_to_seq_id.mutable_data(TARGET(kCUDA)); + + std::vector word_seq_map; + for (int i = 0; i < x_lod.size() - 1; i++) { + for (int j = x_lod[i]; j < x_lod[i + 1]; j++) { + word_seq_map.push_back(i); + } + } + + 
std::vector offset_x_data_cpu(x_lod.size(), 0); + auto x_lod_data = x_lod.data(); + for (int i = 0; i < offset_x_data_cpu.size(); i++) { + offset_x_data_cpu[i] = x_lod_data[i]; + } + + std::vector offset_y_data_cpu(y_lod.size(), 0); + auto y_lod_data = y_lod.data(); + for (int i = 0; i < offset_y_data_cpu.size(); i++) { + offset_y_data_cpu[i] = y_lod_data[i]; + } + + TargetWrapperCuda::MemcpyAsync(offset_x_data, + offset_x_data_cpu.data(), + sizeof(int) * x_lod.size(), + IoDirection::HtoD, + stream); + + TargetWrapperCuda::MemcpyAsync(offset_y_data, + offset_y_data_cpu.data(), + sizeof(int) * y_lod.size(), + IoDirection::HtoD, + stream); + + TargetWrapperCuda::MemcpyAsync(word_id_to_seq_id_data, + word_seq_map.data(), + sizeof(int) * word_seq_map.size(), + IoDirection::HtoD, + stream); + + int seq_num = x_lod.size() - 1; + int count = param.X->numel(); + int inner_size = param.X->dims()[1]; + switch (param.op_type) { + case 1: // sum + ker_arithmetic_sum< + float><<>>( + out_data, + x_data, + y_data, + offset_x_data, + offset_y_data, + word_id_to_seq_id_data, + seq_num, + inner_size, + count); + break; + case 2: // sub + ker_arithmetic_sub< + float><<>>( + out_data, + x_data, + y_data, + offset_x_data, + offset_y_data, + word_id_to_seq_id_data, + seq_num, + inner_size, + count); + break; + case 3: // mul + ker_arithmetic_mul< + float><<>>( + out_data, + x_data, + y_data, + offset_x_data, + offset_y_data, + word_id_to_seq_id_data, + seq_num, + inner_size, + count); + break; + default: + break; + } + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_arithmetic, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); +REGISTER_LITE_KERNEL(search_seq_arithmetic, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_arithmetic_compute.h b/lite/kernels/cuda/sequence_arithmetic_compute.h new file mode 100644 index 0000000000..a180c50eaa --- /dev/null +++ b/lite/kernels/cuda/sequence_arithmetic_compute.h @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
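Illustrative sketch (not part of the patch): Run() above precomputes a word-to-sequence map so each CUDA thread can find its row's sequence with a single lookup instead of searching the LoD. The expansion on the host looks like this (helper name is illustrative):

#include <cstddef>
#include <cstdint>
#include <vector>

// Expand LoD offsets ({0, 2, 5}) into one sequence id per row ({0, 0, 1, 1, 1}).
std::vector<int> word_to_seq(const std::vector<uint64_t>& lod0) {
  std::vector<int> map;
  for (std::size_t seq = 0; seq + 1 < lod0.size(); ++seq) {
    for (uint64_t w = lod0[seq]; w < lod0[seq + 1]; ++w) {
      map.push_back(static_cast<int>(seq));
    }
  }
  return map;
}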
+ +#pragma once + +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequenceArithmeticCompute + : public KernelLite { + public: + using param_t = operators::SequenceArithmeticParam; + + void Run() override; + virtual ~SequenceArithmeticCompute() = default; + + private: + lite::Tensor offset_x; + lite::Tensor offset_y; + lite::Tensor word_id_to_seq_id; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_arithmetic_compute_test.cc b/lite/kernels/cuda/sequence_arithmetic_compute_test.cc new file mode 100644 index 0000000000..c0746d375d --- /dev/null +++ b/lite/kernels/cuda/sequence_arithmetic_compute_test.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/sequence_arithmetic_compute.h" +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +void sequence_arithmetic_compute_ref(const Tensor& x, + const Tensor& y, + Tensor* out, + int op_type) { + auto x_data = x.data(); + auto y_data = y.data(); + out->Resize(x.dims()); + out->set_lod(x.lod()); + auto out_data = out->mutable_data(); + auto x_seq_offset = x.lod()[0]; + auto y_seq_offset = y.lod()[0]; + int seq_num = x_seq_offset.size() - 1; + int inner_size = x.numel() / x.dims()[0]; + + for (int i = 0; i < seq_num; i++) { + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = std::min(len_x, len_y); + for (int j = 0; j < len; j++) { + switch (op_type) { + case 1: + t_out[j] = input_x[j] + input_y[j]; + break; + case 2: + t_out[j] = input_x[j] - input_y[j]; + break; + case 3: + t_out[j] = input_x[j] * input_y[j]; + break; + default: + break; + } + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(float) * (len_x - len)); + } + } +} + +void prepare_input(Tensor* x, const LoD& x_lod) { + x->Resize({static_cast(x_lod[0].back()), 3}); + x->set_lod(x_lod); + auto x_data = x->mutable_data(); + for (int i = 0; i < x->numel(); i++) { + x_data[i] = (i - x->numel() / 2) * 1.1; + } +} + +TEST(sequence_arithmetic_cuda, run_test) { + lite::Tensor x, y, x_cpu, y_cpu; + lite::Tensor out, out_cpu, out_ref; + lite::LoD x_lod{{0, 2, 5, 9}}, y_lod{{0, 2, 5, 9}}; + + prepare_input(&x_cpu, x_lod); + prepare_input(&y_cpu, y_lod); + + x.Resize(x_cpu.dims()); + x.set_lod(x_cpu.lod()); + auto x_cpu_data = x_cpu.mutable_data(); + x.Assign(x_cpu_data, x_cpu.dims()); + + y.Resize(y_cpu.dims()); + y.set_lod(y_cpu.lod()); + auto y_cpu_data = y_cpu.mutable_data(); + y.Assign(y_cpu_data, y_cpu.dims()); + + 
operators::SequenceArithmeticParam param; + param.X = &x; + param.Y = &y; + param.Out = &out; + param.op_type = 1; + + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + SequenceArithmeticCompute sequence_arithmetic; + sequence_arithmetic.SetContext(std::move(ctx)); + sequence_arithmetic.SetParam(param); + sequence_arithmetic.Run(); + cudaDeviceSynchronize(); + + auto out_data = out.mutable_data(TARGET(kCUDA)); + out_cpu.Resize(out.dims()); + auto out_cpu_data = out_cpu.mutable_data(); + CopySync( + out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); + + sequence_arithmetic_compute_ref(x_cpu, y_cpu, &out_ref, param.op_type); + auto out_ref_data = out_ref.data(); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-3); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_concat_compute.cu b/lite/kernels/cuda/sequence_concat_compute.cu new file mode 100644 index 0000000000..d4390046b0 --- /dev/null +++ b/lite/kernels/cuda/sequence_concat_compute.cu @@ -0,0 +1,151 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
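Illustrative sketch (not part of the patch): sequence_concat interleaves its inputs per sequence, so the output LoD offsets are the elementwise sum of the input offsets — this is what ConcatLoD in the kernel below computes, and what the test further below builds by hand as lod_info_y. A minimal version:

#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint64_t> concat_lod(const std::vector<std::vector<uint64_t>>& lods) {
  std::vector<uint64_t> out(lods[0].size(), 0);
  for (std::size_t i = 1; i < out.size(); ++i) {
    for (const auto& lod : lods) out[i] += lod[i];
  }
  return out;
}

// {0,3,5,6,10} + {0,1,2,3,4} + {0,2,4,6,8} -> {0,6,11,15,22},
// matching lod_info_y in the sequence_concat test further below.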
+ +#include +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_concat_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +const int CUDA_NUM_THREADS = 512; + +template +inline LoD ConcatLoD(const std::vector& xs) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +template +__global__ void ker_sequence_concat(Dtype* out_data, + const uint64_t* in_locate_data, + const int* o2i_map, + const int* o2i_w_map, + const int seq_num, + const int emb_size, + const int count) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + for (int tid = idx; tid < count; tid += blockDim.x * gridDim.x) { + int emb_id = tid % emb_size; + int word_id = tid / emb_size; + int input_id = o2i_map[word_id]; + int cur_work_id = o2i_w_map[word_id]; + const Dtype* in_data = reinterpret_cast( + reinterpret_cast(in_locate_data[input_id])); + out_data[tid] = in_data[cur_work_id * emb_size + emb_id]; + } +} + +void SequenceConcatCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + float* out_data = param.Out->mutable_data(TARGET(kCUDA)); + + int seq_num = param.X[0]->lod()[0].size() - 1; + const int emb_size = param.X[0]->numel() / param.X[0]->dims()[0]; + std::vector in_locate_vec; + for (size_t i = 0; i < param.X.size(); ++i) { + in_locate_vec.push_back( + reinterpret_cast(param.X[i]->data())); + } + in_locate_tensor.Resize({static_cast(in_locate_vec.size())}); + + std::vector out2in_map; + std::vector out2in_word_map; + for (int i = 0; i < seq_num; ++i) { + for (int j = 0; j < param.X.size(); ++j) { + auto offset = param.X[j]->lod()[0]; + int cur_len = offset[i + 1] - offset[i]; + for (int k = 0; k < cur_len; ++k) { + out2in_map.push_back(j); + out2in_word_map.push_back(offset[i] + k); + } + } + } + int word_num = out2in_map.size(); + out2in_map_tensor.Resize({word_num}); + out2in_word_map_tensor.Resize({word_num}); + int* gpu_o2i_map_data = out2in_map_tensor.mutable_data(TARGET(kCUDA)); + int* gpu_o2i_w_map_data = + out2in_word_map_tensor.mutable_data(TARGET(kCUDA)); + uint64_t* gpu_in_locate_data = + in_locate_tensor.mutable_data(TARGET(kCUDA)); + + TargetWrapperCuda::MemcpyAsync(gpu_o2i_map_data, + out2in_map.data(), + sizeof(int) * out2in_map.size(), + IoDirection::HtoD, + stream); + TargetWrapperCuda::MemcpyAsync(gpu_o2i_w_map_data, + out2in_word_map.data(), + sizeof(int) * out2in_word_map.size(), + IoDirection::HtoD, + stream); + TargetWrapperCuda::MemcpyAsync(gpu_in_locate_data, + in_locate_vec.data(), + sizeof(uint64_t) * in_locate_vec.size(), + IoDirection::HtoD, + stream); + + param.Out->set_lod(ConcatLoD(param.X)); + + int count = param.X[0]->numel(); + for (int i = 1; i < param.X.size(); ++i) { + count += param.X[i]->numel(); + } + + int blocks = (count + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; + ker_sequence_concat<<>>( + out_data, + gpu_in_locate_data, + gpu_o2i_map_data, + gpu_o2i_w_map_data, + seq_num, + emb_size, + count); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_concat, + kCUDA, 
+ kFloat, + kNCHW, + paddle::lite::kernels::cuda::SequenceConcatCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_concat_compute.h b/lite/kernels/cuda/sequence_concat_compute.h new file mode 100644 index 0000000000..1737c18dd3 --- /dev/null +++ b/lite/kernels/cuda/sequence_concat_compute.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequenceConcatCompute + : public KernelLite { + public: + using param_t = operators::SequenceConcatParam; + + void Run() override; + virtual ~SequenceConcatCompute() = default; + + private: + lite::Tensor out2in_map_tensor; + lite::Tensor out2in_word_map_tensor; + lite::Tensor in_locate_tensor; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_concat_compute_test.cc b/lite/kernels/cuda/sequence_concat_compute_test.cc new file mode 100644 index 0000000000..477dc48dbb --- /dev/null +++ b/lite/kernels/cuda/sequence_concat_compute_test.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/cuda/sequence_concat_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +namespace { +inline LoD ConcatLoD(const std::vector& xs, + std::vector* xs_in_order) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + if (x_lod[i - 1] < x_lod[i]) { + xs_in_order->emplace_back(xs[j]->Slice(x_lod[i - 1], x_lod[i])); + } + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +static void sequence_concat_ref(const std::vector& xs, + lite::Tensor* out) { + std::vector out_dims; + int64_t batch_size = 0; + int64_t feature_size = 0; + for (const auto& tensor : xs) { + const auto x_dims = tensor->dims(); + if (out_dims.empty()) { + out_dims = x_dims.Vectorize(); + } + batch_size += x_dims[0]; + if (feature_size == 0) { + feature_size = x_dims.production() / x_dims[0]; + } else { + CHECK_EQ(feature_size, x_dims.production() / x_dims[0]) + << "Inputs of sequence concat must have same feature size"; + } + } + out_dims[0] = batch_size; + out->Resize(out_dims); + std::vector x_in_order; + out->set_lod(ConcatLoD(xs, &x_in_order)); + + int num = x_in_order.size(); + std::vector input_cols(num); + for (int i = 0; i < num; ++i) { + input_cols[i] = x_in_order[i].numel(); + } + float* out_data = out->mutable_data(); + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + auto input_data = x_in_order[j].data(); + memcpy(out_data + col_idx, input_data, sizeof(float) * col_len); + col_idx += col_len; + } +} + +#define PREPARE_INPUT_DATA(name) \ + name.Resize({name##_lod_len, feature_len}); \ + name##_cpu.Resize({name##_lod_len, feature_len}); \ + name##_ref.Resize({name##_lod_len, feature_len}); \ + name.set_lod(lod_info_##name); \ + name##_cpu.set_lod(lod_info_##name); \ + name##_ref.set_lod(lod_info_##name); \ + float* name##_cpu_data = name##_cpu.mutable_data(); \ + float* name##_ref_data = name##_ref.mutable_data(); \ + for (int i = 0; i < name##_cpu.numel(); ++i) { \ + name##_cpu_data[i] = (i - 2.0) * 1.0; \ + name##_ref_data[i] = (i - 2.0) * 1.0; \ + } \ + name.Assign(name##_cpu_data, \ + name##_cpu.dims()); + +#define PREPARE_OUTPUT_INFO(name) \ + name##_cpu.Resize({y_lod_len, feature_len}); \ + name##_ref.Resize({y_lod_len, feature_len}); \ + name.Resize({y_lod_len, feature_len}); \ + float* name##_cpu_data = name##_cpu.mutable_data(); + +} // namespace + +TEST(sequence_concat_cuda, normal) { + SequenceConcatCompute seq_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::SequenceConcatParam param; + lite::Tensor x1, x2, x3, x1_cpu, x2_cpu, x3_cpu, x1_ref, x2_ref, x3_ref; + lite::Tensor y, y_cpu, y_ref; + + int32_t x1_lod_len = 10, feature_len = 4; + int32_t x2_lod_len = 4, x3_lod_len = 8; + int32_t y_lod_len = x1_lod_len + x2_lod_len + x3_lod_len; + LoD lod_info_x1{{0, 3, 5, 6, 10}}; + LoD lod_info_x2{{0, 1, 2, 3, 4}}; + LoD lod_info_x3{{0, 2, 4, 6, 8}}; + LoD lod_info_y{{0, 0, 0, 0, 0}}; + for (size_t i = 0; i < lod_info_x1[0].size(); ++i) { + lod_info_y[0][i] = + lod_info_x1[0][i] + lod_info_x2[0][i] + lod_info_x3[0][i]; + } + + PREPARE_INPUT_DATA(x1); + PREPARE_INPUT_DATA(x2); + PREPARE_INPUT_DATA(x3); + PREPARE_OUTPUT_INFO(y); + + param.X = std::vector({&x1, &x2, &x3}); + param.Out = &y; + seq_kernel.SetParam(param); + + 
cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + seq_kernel.SetContext(std::move(ctx)); + seq_kernel.Run(); + cudaDeviceSynchronize(); + + auto* y_data = y.mutable_data(TARGET(kCUDA)); + CopySync( + y_cpu_data, y_data, sizeof(float) * y.numel(), IoDirection::DtoH); + + std::vector input_ref({&x1_ref, &x2_ref, &x3_ref}); + sequence_concat_ref(input_ref, &y_ref); + float* y_ref_data = y_ref.mutable_data(); + for (int i = 0; i < y.numel(); i++) { + EXPECT_NEAR(y_cpu_data[i], y_ref_data[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_pool_compute.cu b/lite/kernels/cuda/sequence_pool_compute.cu new file mode 100644 index 0000000000..97876ec32f --- /dev/null +++ b/lite/kernels/cuda/sequence_pool_compute.cu @@ -0,0 +1,258 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_pool_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void seq_pool_average_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_slice_num = static_cast(seq_offset[out_batch_id + 1] - + seq_offset[out_batch_id]); + int in_offset = static_cast(seq_offset[out_batch_id] * slice_size); + src_in += in_offset + out_id; + Dtype sum = (Dtype)0; + for (int i = 0; i < in_slice_num; ++i) { + sum += src_in[i * slice_size]; + } + dst[out_batch_id * slice_size + out_id] = sum / in_slice_num; + } +} + +template +__global__ void seq_pool_sum_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_slice_num = static_cast(seq_offset[out_batch_id + 1] - + seq_offset[out_batch_id]); + int in_offset = static_cast(seq_offset[out_batch_id] * slice_size); + src_in += in_offset + out_id; + Dtype sum = (Dtype)0; + for (int i = 0; i < in_slice_num; ++i) { + sum += src_in[i * slice_size]; + } + dst[out_batch_id * slice_size + out_id] = sum; + } +} + +template +__global__ void seq_pool_sqrt_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int 
in_slice_num = static_cast(seq_offset[out_batch_id + 1] - + seq_offset[out_batch_id]); + int in_offset = static_cast(seq_offset[out_batch_id] * slice_size); + src_in += in_offset + out_id; + Dtype sum = (Dtype)0; + for (int i = 0; i < in_slice_num; ++i) { + sum += src_in[i * slice_size]; + } + dst[out_batch_id * slice_size + out_id] = sum * rsqrtf(in_slice_num); + } +} + +template +__global__ void seq_pool_max_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_slice_num = static_cast(seq_offset[out_batch_id + 1] - + seq_offset[out_batch_id]); + int in_offset = static_cast(seq_offset[out_batch_id] * slice_size); + src_in += in_offset + out_id; + Dtype max = src_in[0]; + for (int i = 1; i < in_slice_num; ++i) { + Dtype val = src_in[i * slice_size]; + if (val > max) { + max = val; + } + } + dst[out_batch_id * slice_size + out_id] = max; + } +} + +template +__global__ void seq_pool_last_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_offset = + (static_cast(seq_offset[out_batch_id + 1]) - 1) * slice_size; + dst[tid] = src_in[in_offset + out_id]; + } +} + +template +__global__ void seq_pool_first_kernel(Dtype* dst, + const Dtype* src_in, + const int batch_size, + const uint64_t* seq_offset, + const int slice_size) { + int total = slice_size * batch_size; + CUDA_KERNEL_LOOP(tid, total) { + int out_batch_id = tid / slice_size; + int out_id = tid % slice_size; + int in_offset = static_cast(seq_offset[out_batch_id] * slice_size); + dst[tid] = src_in[in_offset + out_id]; + } +} + +void SequencePoolCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + std::vector seq_offset = param.X->lod()[0]; + int batch_size = param.X->lod()[0].size() - 1; + int slice_size = param.Out->dims().production() / batch_size; + + float* out_data = param.Out->mutable_data(TARGET(kCUDA)); + const float* in_data = param.X->data(); + + lite::Tensor seq_offset_D; + seq_offset_D.Resize({static_cast(seq_offset.size())}); + TargetWrapperCuda::MemcpyAsync( + seq_offset_D.mutable_data(TARGET(kCUDA)), + seq_offset.data(), + sizeof(uint64_t) * seq_offset.size(), + IoDirection::HtoD, + stream); + + if (param.pool_type == "MAX") { + seq_pool_max_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else if (param.pool_type == "AVERAGE") { + seq_pool_average_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else if (param.pool_type == "SUM") { + seq_pool_sum_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else if (param.pool_type == "SQRT") { + seq_pool_sqrt_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else if (param.pool_type == "FIRST") { + seq_pool_first_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else if (param.pool_type == "LAST") { + seq_pool_last_kernel<<>>(out_data, + in_data, + batch_size, + seq_offset_D.data(), + slice_size); + } else { + LOG(ERROR) << "pool type " << param.pool_type << " is not supoorted."; + } + + 
std::vector offset_new(static_cast(batch_size + 1)); + + for (int i = 0; i <= batch_size; ++i) { + offset_new[i] = i; + } + std::vector> voffset_new; + voffset_new.push_back(offset_new); + param.Out->set_lod(voffset_new); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(sequence_pool, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SequencePoolCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("MaxIndex", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_pool_compute.h b/lite/kernels/cuda/sequence_pool_compute.h new file mode 100644 index 0000000000..9309454d18 --- /dev/null +++ b/lite/kernels/cuda/sequence_pool_compute.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class SequencePoolCompute + : public KernelLite { + public: + using param_t = operators::SequencePoolParam; + + void Run() override; + virtual ~SequencePoolCompute() = default; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_pool_compute_test.cc b/lite/kernels/cuda/sequence_pool_compute_test.cc new file mode 100644 index 0000000000..0f2656cd1d --- /dev/null +++ b/lite/kernels/cuda/sequence_pool_compute_test.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
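Illustrative sketch (not part of the patch): each pooling kernel above reduces one sequence (a [len, slice_size] slice of the input) to a single output row according to pool_type: MAX, AVERAGE, SUM, SQRT (sum scaled by 1/sqrt(len)), FIRST and LAST. A CPU reference for the SQRT case, which the test below exercises (column 0 sums to 396 and 396 / sqrt(10) ≈ 125.226):

#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// One output row per sequence: per-column sum over the sequence, scaled by 1/sqrt(len).
std::vector<float> seq_pool_sqrt_ref(const std::vector<float>& in,
                                     const std::vector<uint64_t>& lod0,
                                     std::size_t width) {
  std::vector<float> out((lod0.size() - 1) * width, 0.0f);
  for (std::size_t s = 0; s + 1 < lod0.size(); ++s) {
    std::size_t len = lod0[s + 1] - lod0[s];
    for (uint64_t r = lod0[s]; r < lod0[s + 1]; ++r) {
      for (std::size_t c = 0; c < width; ++c) {
        out[s * width + c] += in[r * width + c];
      }
    }
    for (std::size_t c = 0; c < width; ++c) {
      out[s * width + c] /= std::sqrt(static_cast<float>(len));
    }
  }
  return out;
}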
+ +#include "lite/kernels/cuda/sequence_pool_compute.h" +#include +#include +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +TEST(sequence_pool_cuda, normal) { + SequencePoolCompute seq_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + lite::Tensor x, x_cpu, out, out_cpu; + lite::LoD lod; + lod.push_back(std::vector{0, 10}); + + x.set_lod(lod); + x_cpu.set_lod(lod); + const size_t second_dim = 8u; + std::vector input_shape{static_cast(lod[0].back()), + static_cast(second_dim)}; + lite::DDim in_dims(input_shape); + x.Resize(in_dims); + x_cpu.Resize(in_dims); + + const size_t out_first_dim = lod[0].size() - 1; + std::vector output_shape{static_cast(out_first_dim), + static_cast(second_dim)}; + lite::DDim out_dims(output_shape); + out.Resize(out_dims); + out_cpu.Resize(out_dims); + + auto x_cpu_data = x_cpu.mutable_data(); + auto out_data = out.mutable_data(TARGET(kCUDA)); + auto out_cpu_data = out_cpu.mutable_data(); + + for (int64_t i = 0; i < x_cpu.dims().production(); i++) { + x_cpu_data[i] = 1.1f * i; + } + x.Assign(x_cpu_data, x_cpu.dims()); + + operators::SequencePoolParam param; + param.X = &x; + param.Out = &out; + std::vector pool_types( + {"MAX", "AVERAGE", "SUM", "SQRT", "FIRST", "LAST"}); + std::map> type_map; + type_map["MAX"] = {79.2, 80.3, 81.4, 82.5, 83.6, 84.7, 85.8, 86.9}; + type_map["AVERAGE"] = {39.6, 40.7, 41.8, 42.9, 44, 45.1, 46.2, 47.3}; + type_map["SUM"] = {396, 407, 418, 429, 440, 451, 462, 473}; + type_map["SQRT"] = { + 125.226, 128.705, 132.183, 135.662, 139.14, 142.619, 146.097, 149.576}; + type_map["FIRST"] = {0, 1.1, 2.2, 3.3, 4.4, 5.5, 6.6, 7.7}; + type_map["LAST"] = {79.2, 80.3, 81.4, 82.5, 83.6, 84.7, 85.8, 86.9}; + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + seq_kernel.SetContext(std::move(ctx)); + for (std::string pool_type : pool_types) { + param.pool_type = pool_type; + seq_kernel.SetParam(param); + + seq_kernel.Run(); + cudaDeviceSynchronize(); + + CopySync(out_cpu_data, + out_data, + sizeof(float) * out_cpu.numel(), + IoDirection::DtoH); + + std::vector ref_results = type_map[pool_type]; + + for (int i = 0; i < out_cpu.numel(); i++) { + EXPECT_NEAR(out_cpu_data[i], ref_results[i], 1e-3); + } + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_reverse_compute.cu b/lite/kernels/cuda/sequence_reverse_compute.cu new file mode 100644 index 0000000000..68447fcebb --- /dev/null +++ b/lite/kernels/cuda/sequence_reverse_compute.cu @@ -0,0 +1,130 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/kernels/cuda/sequence_reverse_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__host__ __device__ inline size_t UpperBound(const T* x, + size_t num, + const T& val) { + // The following code is from + // https://en.cppreference.com/w/cpp/algorithm/upper_bound + auto* first = x; + int64_t count = static_cast(num); + while (count > 0) { + auto step = (count >> 1); + auto* it = first + step; + if (val < *it) { + count = step; + } else { + first = ++it; + count -= (step + 1); + } + } + return static_cast(first - x); +} + +template +__global__ void SequenceReverseKernelGridIsOne( + const T* x, T* y, const int64_t* lod, size_t lod_count, int64_t row_numel) { + int64_t idx = static_cast(threadIdx.x); + auto row_idx_x = idx / row_numel; + auto lod_idx = UpperBound(lod, lod_count, row_idx_x); + auto row_idx_y = lod[lod_idx - 1] + (lod[lod_idx] - 1 - row_idx_x); + auto idx_y = row_idx_y * row_numel + idx % row_numel; + y[idx_y] = x[idx]; +} + +template +__global__ void SequenceReverseKernel(const T* x, + T* y, + const int64_t* lod, + size_t lod_count, + int64_t row_numel, + size_t limit) { + int64_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx < limit) { + auto row_idx_x = idx / row_numel; + auto lod_idx = UpperBound(lod, lod_count, row_idx_x); + auto row_idx_y = lod[lod_idx - 1] + (lod[lod_idx] - 1 - row_idx_x); + auto idx_y = row_idx_y * row_numel + idx % row_numel; + y[idx_y] = x[idx]; + } +} + +template +void SequenceReverseCompute::Run() { + auto& param = this->template Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + size_t limit = static_cast(param.X->numel()); + int64_t row_numel = static_cast(limit / param.X->dims()[0]); + const auto* x_data = param.X->template data(); + auto y_data = param.Out->template mutable_data(TARGET(kCUDA)); + CHECK_NE(x_data, y_data) + << "SequenceReverse Op does not support in-place operation"; + const auto lod = param.X->lod()[param.X->lod().size() - 1]; + const size_t lod_count = lod.size(); + param.Out->set_lod(param.X->lod()); + + lod_cuda.Resize({static_cast(lod.size())}); + int64_t* lod_data = lod_cuda.mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemcpyAsync(lod_data, + lod.data(), + sizeof(int64_t) * lod.size(), + IoDirection::HtoD, + stream); + constexpr int num_threads = 1024; + int block_size = limit <= num_threads ? 
limit : num_threads; + int grid_size = (limit + num_threads - 1) / num_threads; + if (grid_size == 1) { + SequenceReverseKernelGridIsOne<<<1, block_size, 0, stream>>>( + x_data, y_data, lod_data, lod_count, row_numel); + } else { + SequenceReverseKernel<<>>( + x_data, y_data, lod_data, lod_count, row_numel, limit); + } + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +typedef paddle::lite::kernels::cuda::SequenceReverseCompute + ReverseFp32; + +typedef paddle::lite::kernels::cuda::SequenceReverseCompute + ReverseInt64; + +REGISTER_LITE_KERNEL(sequence_reverse, kCUDA, kFloat, kNCHW, ReverseFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_reverse, kCUDA, kInt64, kNCHW, ReverseInt64, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_reverse_compute.h b/lite/kernels/cuda/sequence_reverse_compute.h new file mode 100644 index 0000000000..6b6199e020 --- /dev/null +++ b/lite/kernels/cuda/sequence_reverse_compute.h @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class SequenceReverseCompute : public KernelLite { + public: + using param_t = operators::SequenceReverseParam; + + void Run() override; + virtual ~SequenceReverseCompute() = default; + + private: + lite::Tensor lod_cuda; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_reverse_compute_test.cc b/lite/kernels/cuda/sequence_reverse_compute_test.cc new file mode 100644 index 0000000000..3317b52303 --- /dev/null +++ b/lite/kernels/cuda/sequence_reverse_compute_test.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/cuda/sequence_reverse_compute.h" +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +static void sequence_reverse_ref(const lite::Tensor* x, lite::Tensor* y) { + const auto* x_data = x->data(); + auto seq_offset = x->lod()[x->lod().size() - 1]; + int width = x->numel() / x->dims()[0]; + auto* y_data = y->mutable_data(); + for (int i = 0; i < static_cast(seq_offset.size()) - 1; ++i) { + auto start_pos = seq_offset[i]; + auto end_pos = seq_offset[i + 1]; + for (auto pos = start_pos; pos < end_pos; ++pos) { + auto cur_pos = end_pos - pos - 1 + start_pos; + std::memcpy(y_data + pos * width, + x_data + cur_pos * width, + width * sizeof(float)); + } + } +} + +TEST(sequence_reverse_cuda, normal) { + SequenceReverseCompute seq_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::SequenceReverseParam param; + lite::Tensor x, x_cpu, x_ref; + lite::Tensor y, y_cpu, y_ref; + + int32_t lod_len = 10, feature_len = 4; + LoD lod_info{{0, 2, 4}, {0, 3, 5, 6, 10}}; + + x.Resize({lod_len, feature_len}); + x_cpu.Resize({lod_len, feature_len}); + x_ref.Resize({lod_len, feature_len}); + y.Resize({lod_len, feature_len}); + y_cpu.Resize({lod_len, feature_len}); + y_ref.Resize({lod_len, feature_len}); + x.set_lod(lod_info); + x_cpu.set_lod(lod_info); + x_ref.set_lod(lod_info); + y.set_lod(lod_info); + y_cpu.set_lod(lod_info); + y_ref.set_lod(lod_info); + + auto* y_data = y.mutable_data(TARGET(kCUDA)); + + float* x_cpu_data = x_cpu.mutable_data(); + float* x_ref_data = x_ref.mutable_data(); + float* y_cpu_data = y_cpu.mutable_data(); + float* y_ref_data = y_ref.mutable_data(); + + for (int i = 0; i < x_cpu.numel(); ++i) { + x_cpu_data[i] = (i - 2.0) * 1.0; + x_ref_data[i] = (i - 2.0) * 1.0; + } + + x.Assign(x_cpu_data, x_cpu.dims()); + + param.X = &x; + param.Out = &y; + seq_kernel.SetParam(param); + + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + + seq_kernel.SetContext(std::move(ctx)); + seq_kernel.Run(); + cudaDeviceSynchronize(); + + CopySync( + y_cpu_data, y_data, sizeof(float) * y.numel(), IoDirection::DtoH); + + sequence_reverse_ref(&x_ref, &y_ref); + for (int i = 0; i < y.numel(); i++) { + EXPECT_NEAR(y_cpu_data[i], y_ref_data[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/sequence_topk_avg_pooling_compute.cu b/lite/kernels/cuda/sequence_topk_avg_pooling_compute.cu new file mode 100644 index 0000000000..8ea3edb30d --- /dev/null +++ b/lite/kernels/cuda/sequence_topk_avg_pooling_compute.cu @@ -0,0 +1,209 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/cuda/sequence_topk_avg_pooling_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +__global__ void topk_avg_pooling_kernel_by_row_improve( + Dtype *output_data, + const Dtype *input, + const int *gpu_input_offset_l, + const int *gpu_input_offset_r, + const int topk_size, + const int *topks, + const int feat_map_num) { + int row = + gpu_input_offset_l[blockIdx.x + 1] - gpu_input_offset_l[blockIdx.x]; // 8 + int col = gpu_input_offset_r[blockIdx.x + 1] - + gpu_input_offset_r[blockIdx.x]; // 30 + int max_k = topks[topk_size - 1]; + max_k = max_k < col ? max_k : col; + + extern __shared__ Dtype smem[]; // H*W + + const Dtype *fm_row_in_data = input; + for (int i = 0; i < blockIdx.x; ++i) { + int tmp_row = gpu_input_offset_l[i + 1] - gpu_input_offset_l[i]; + int tmp_col = gpu_input_offset_r[i + 1] - gpu_input_offset_r[i]; + fm_row_in_data += tmp_row * feat_map_num * tmp_col; + } + fm_row_in_data += blockIdx.y * row * col; + + for (int i = threadIdx.x; i < row * col; i += blockDim.x) { + smem[i] = fm_row_in_data[i]; + } + __syncthreads(); + + for (int idx = threadIdx.x; idx < row; idx += blockDim.x) { + Dtype *fm_row_out_data = + output_data + + (gpu_input_offset_l[blockIdx.x] + idx) * feat_map_num * topk_size + + blockIdx.y * topk_size; + + Dtype *smem_start_col = smem + idx * col; + + int counter = max_k; // topk_size; + Dtype last_max_val = -20000.0; + while (counter) { + Dtype max_val = -10000.0; + int max_pos = 0; + int m = 0; + for (; m < col; m++) { + Dtype cur_data = smem_start_col[m]; + if (cur_data > max_val) { + max_val = cur_data; + max_pos = m; + last_max_val = max_val; + } + } + if (max_val < -9999.0) { // == -10000.0 + max_val = last_max_val; + } + smem_start_col[max_pos] = -10000000.0; + int i = max_k - counter; + for (int c = 0; c < topk_size; c++) { + if (i <= topks[c] - 1) { + fm_row_out_data[c] += max_val; + } + } + counter--; + } + __syncthreads(); + // compute avg + for (int i = 0; i < topk_size; i++) { + fm_row_out_data[i] = fm_row_out_data[i] / topks[i]; + } + } +} + +template +void SequenceTopkAvgPoolingCompute::Run() { + auto ¶m = this->Param(); + auto &ctx = this->ctx_->template As(); + auto cuda_stream = ctx.exec_stream(); + int topk_num = param.topks.size(); + lite::DDim top_ks_shape(std::vector{topk_num, 1, 1, 1}); + _top_ks.Resize(top_ks_shape); + cudaMemcpyAsync(_top_ks.mutable_data(TARGET(kCUDA)), + ¶m.topks[0], + sizeof(int) * topk_num, + cudaMemcpyHostToDevice, + cuda_stream); + + int width_offset_len = param.COLUMN->lod()[0].size(); + lite::DDim width_offset_shape( + std::vector{width_offset_len, 1, 1, 1}); + _width_offset.Resize(width_offset_shape); + std::vector width_lod_0(width_offset_len, 0); + for (size_t i = 0; i < param.COLUMN->lod()[0].size(); ++i) { + width_lod_0[i] = static_cast(param.COLUMN->lod()[0][i]); + } + cudaMemcpyAsync(_width_offset.mutable_data(TARGET(kCUDA)), + &width_lod_0[0], + sizeof(int) * width_offset_len, + cudaMemcpyHostToDevice, + cuda_stream); + + int height_offset_len = param.ROW->lod()[0].size(); + lite::DDim height_offset_shape( + std::vector{height_offset_len, 1, 1, 1}); + _height_offset.Resize(height_offset_shape); + std::vector height_lod_0(height_offset_len, 0); + for (size_t i = 0; i < param.ROW->lod()[0].size(); ++i) { + height_lod_0[i] = static_cast(param.ROW->lod()[0][i]); + } + cudaMemcpyAsync(_height_offset.mutable_data(TARGET(kCUDA)), + &height_lod_0[0], + sizeof(int) 
* height_offset_len, + cudaMemcpyHostToDevice, + cuda_stream); + + const Tensor *x_tensor = param.X; + Tensor *out_tensor = param.Out; + const T *in_data = x_tensor->data(); + T *out_data = out_tensor->mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemsetAsync(out_tensor->mutable_data(TARGET(kCUDA)), + 0, + sizeof(T) * out_tensor->numel(), + cuda_stream); + + int num = param.ROW->lod()[0].size() - 1; + int channel = param.channel_num; + + const int *height_offset = _height_offset.data(); + const int *width_offset = _width_offset.data(); + + int feat_map_size = 0; + for (size_t i = 0; i < height_lod_0.size() - 1; ++i) { + int height = height_lod_0[i + 1] - height_lod_0[i]; + int width = width_lod_0[i + 1] - width_lod_0[i]; + if (height * width > feat_map_size) { + feat_map_size = height * width; + } + } + dim3 blocks(num, channel); + dim3 threads(32, 1); + topk_avg_pooling_kernel_by_row_improve< + T><<>>( + out_data, + in_data, + height_offset, + width_offset, + param.topks.size(), + _top_ks.data(), + param.channel_num); + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + sequence_topk_avg_pooling, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SequenceTopkAvgPoolingCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("ROW", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindInput("COLUMN", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("pos", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/cuda/sequence_topk_avg_pooling_compute.h b/lite/kernels/cuda/sequence_topk_avg_pooling_compute.h new file mode 100644 index 0000000000..321ec9cfce --- /dev/null +++ b/lite/kernels/cuda/sequence_topk_avg_pooling_compute.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
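Editor's note: the following is a minimal CPU sketch (not part of this patch) of what topk_avg_pooling_kernel_by_row_improve above computes for a single (sequence, channel) feature map of shape row x col. It assumes topks is sorted ascending, as the kernel's use of topks[topk_size - 1] implies; the function name and signature are illustrative only.

#include <algorithm>
#include <cstddef>
#include <functional>
#include <vector>

// CPU reference for one row of a (row x col) map: accumulate the rank-th
// largest value into every output slot c whose topks[c] covers that rank,
// then average by topks[c] (even when col < topks[c], matching the kernel).
void topk_avg_pool_row_ref(const float* row_data, int col,
                           const std::vector<int>& topks, float* out) {
  std::vector<float> vals(row_data, row_data + col);
  std::sort(vals.begin(), vals.end(), std::greater<float>());
  const int max_k = std::min(topks.back(), col);  // topks assumed ascending
  for (std::size_t c = 0; c < topks.size(); ++c) out[c] = 0.f;
  for (int rank = 0; rank < max_k; ++rank) {
    for (std::size_t c = 0; c < topks.size(); ++c) {
      if (rank <= topks[c] - 1) out[c] += vals[rank];
    }
  }
  for (std::size_t c = 0; c < topks.size(); ++c) out[c] /= topks[c];
}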
+ +#pragma once +#include +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/core/kernel.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +template +class SequenceTopkAvgPoolingCompute + : public KernelLite { + public: + using param_t = operators::SequenceTopkAvgPoolingParam; + + void Run() override; + + virtual ~SequenceTopkAvgPoolingCompute() = default; + + protected: + lite::Tensor _height_offset; + lite::Tensor _width_offset; + lite::Tensor _top_ks; +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/softmax_compute.cu b/lite/kernels/cuda/softmax_compute.cu index d8d2987524..6293f7295e 100644 --- a/lite/kernels/cuda/softmax_compute.cu +++ b/lite/kernels/cuda/softmax_compute.cu @@ -173,9 +173,10 @@ void SoftmaxCompute::Run() { cudaGetDeviceProperties(&deviceProp, device_id); size_t sharedmem_size = deviceProp.sharedMemPerBlock; int max_dimsize = sharedmem_size / sizeof(float) / threads; - auto input_data = param.x->data(); auto output_data = param.output->mutable_data(TARGET(kCUDA)); + TargetWrapperCuda::MemsetSync( + output_data, 0, param.output->numel() * sizeof(float)); if (axis_size <= max_dimsize) { int use_sharemem_size = axis_size * threads * sizeof(float); sharemem_softmax_kernel<<>>( @@ -194,7 +195,7 @@ void SoftmaxCompute::Run() { auto max_data = tmax_data.mutable_data(TARGET(kCUDA)); auto sum_data = tsum_data.mutable_data(TARGET(kCUDA)); //! firstly, get maximum data - float min_data = std::numeric_limits::min(); + float min_data = std::numeric_limits::lowest(); softmax_max_kernel<<>>(total_threads, input_data, max_data, @@ -217,7 +218,7 @@ void SoftmaxCompute::Run() { total_threads, output_data, sum_data, inner_num, outer_num, axis_size); } cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); + if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error); } } // namespace cuda @@ -244,3 +245,19 @@ REGISTER_LITE_KERNEL(softmax, PRECISION(kFloat), DATALAYOUT(kNCHW))}) .Finalize(); +REGISTER_LITE_KERNEL(search_seq_softmax, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::SoftmaxCompute, + def) + .BindInput("X", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out", + {LiteType::GetTensorTy(TARGET(kCUDA), + PRECISION(kFloat), + DATALAYOUT(kNCHW))}) + .BindOutput("Out_log", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/var_conv_2d_compute.cu b/lite/kernels/cuda/var_conv_2d_compute.cu new file mode 100644 index 0000000000..f2588a8f53 --- /dev/null +++ b/lite/kernels/cuda/var_conv_2d_compute.cu @@ -0,0 +1,263 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "lite/backends/cuda/math/gemm.h" +#include "lite/core/op_registry.h" +#include "lite/core/target_wrapper.h" +#include "lite/core/tensor.h" +#include "lite/kernels/cuda/var_conv_2d_compute.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +const int CUDA_NUM_THREADS = 512; + +template +__global__ void var_im2col_gpu_kernel(const int n, + const Dtype* data_im, + const int height, + const int width, + const int kernel_h, + const int kernel_w, + const int pad_h, + const int pad_w, + const int stride_h, + const int stride_w, + const int height_col, + const int width_col, + Dtype* data_col) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + for (int index = idx; index < n; index += blockDim.x * gridDim.x) { + const int h_index = index / width_col; + const int h_col = h_index % height_col; + const int w_col = index % width_col; + const int c_im = h_index / height_col; + const int c_col = c_im * kernel_h * kernel_w; + const int h_offset = h_col * stride_h - pad_h; + const int w_offset = w_col * stride_w - pad_w; + + Dtype* data_col_ptr = data_col; + data_col_ptr += (c_col * height_col + h_col) * width_col + w_col; + const Dtype* data_im_ptr = data_im; + data_im_ptr += (c_im * height + h_offset) * width + w_offset; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + int h_im = h_offset + i; + int w_im = w_offset + j; + *data_col_ptr = + (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) + ? data_im_ptr[i * width + j] + : 0; + data_col_ptr += height_col * width_col; + } + } + } +} + +void VarConv2DCompute::var_im2col(const cudaStream_t& stream) { + auto& param = this->Param(); + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + // auto* in_row = param.ROW; + // auto* in_col = param.COLUMN; + const auto* input = param.X; + auto* col = param.Col; + + int batch = input->lod()[0].size() - 1; + const auto& bottom_offset = input->lod()[0]; + // 2-D lod info. 
+ // const auto& offset_x = in_col->lod()[0]; + // const auto& offset_y = in_row->lod()[0]; + const auto& offset_y = param.X->lod()[1]; + const auto& offset_x = param.X->lod()[2]; + // top offset is the whole size of each data sample + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_x = top_im_x * top_im_y; + int top_y = input_channel * kernel_h * kernel_w; + top_size += top_y * top_x; + top_offset.push_back(top_size); + } + + LoD col_lod; + col_lod.push_back(top_offset); + col->set_lod(col_lod); + std::vector col_dims_vec{top_size}; + col_dims_vec.push_back(1); + col->Resize(col_dims_vec); + auto* top_data = col->mutable_data(TARGET(kCUDA)); + const auto* bottom_data = input->data(); + + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int width_col = (width - 1) / stride_w + 1; + int height_col = (height - 1) / stride_h + 1; + const float* data_im = bottom_data + b_offset; + float* data_col = top_data + t_offset; + + // We are going to launch channels * height_col * width_col kernels, each + // kernel responsible for copying a single-channel grid. + int num_kernels = height_col * width_col * input_channel; + const int CUDA_NUM_BLOCKS = + (num_kernels + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; + var_im2col_gpu_kernel< + float><<>>( + num_kernels, + data_im, + height, + width, + kernel_h, + kernel_w, + ((stride_h - 1) * height + kernel_h - 1) / 2, + ((stride_w - 1) * width + kernel_w - 1) / 2, + stride_h, + stride_w, + height_col, + width_col, + data_col); + } +} + +void VarConv2DCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto stream = ctx.exec_stream(); + + auto* bottom = param.X; + // auto* in_row = param.ROW; + // auto* in_col = param.COLUMN; + auto* w = param.W; + auto* top = param.Out; + auto* col = param.Col; + int output_channel = param.output_channel; + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + + var_im2col(stream); + + int batch = bottom->lod()[0].size() - 1; + const auto& col_offset = col->lod()[0]; + // const auto& offset_x = in_col->lod()[0]; + // const auto& offset_y = in_row->lod()[0]; + const auto& offset_y = param.X->lod()[1]; + const auto& offset_x = param.X->lod()[2]; + std::vector top_offset; + std::vector height_vector; + std::vector width_vector; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + height_vector.push_back(top_im_y); + width_vector.push_back(top_im_x); + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + 
top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top->set_lod(top_lod); + std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + + auto* top_data = top->mutable_data(TARGET(kCUDA)); + const auto* w_data = w->data(); + const auto* col_data = col->data(); + + std::unique_ptr> gemm_impl_; + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } + float* out_data = top_data + top_offset[b]; + const float* in_data = col_data + col->lod()[0][b]; + gemm_impl_.reset(new lite::cuda::math::Gemm); + gemm_impl_->init(false, + false, + w->dims()[0], + height_vector[b] * width_vector[b], + input_channel * kernel_h * kernel_w, + &ctx); + gemm_impl_->run(1., 0., w_data, in_data, out_data, &ctx); + } + + cudaError_t error = cudaGetLastError(); + if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(var_conv_2d, + kCUDA, + kFloat, + kNCHW, + paddle::lite::kernels::cuda::VarConv2DCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .BindOutput("Col", {LiteType::GetTensorTy(TARGET(kCUDA))}) + .Finalize(); diff --git a/lite/kernels/cuda/var_conv_2d_compute.h b/lite/kernels/cuda/var_conv_2d_compute.h new file mode 100644 index 0000000000..e0b8e30c50 --- /dev/null +++ b/lite/kernels/cuda/var_conv_2d_compute.h @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "lite/core/kernel.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +class VarConv2DCompute : public KernelLite { + public: + using param_t = operators::VarConv2DParam; + + void Run() override; + virtual ~VarConv2DCompute() = default; + + private: + void var_im2col(const cudaStream_t& stream); +}; + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/cuda/var_conv_2d_compute_test.cc b/lite/kernels/cuda/var_conv_2d_compute_test.cc new file mode 100644 index 0000000000..98e9c73cdd --- /dev/null +++ b/lite/kernels/cuda/var_conv_2d_compute_test.cc @@ -0,0 +1,360 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/cuda/var_conv_2d_compute.h" +#include +#include +#include +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace cuda { + +static void im2col_ref(const lite::Tensor& input, + const lite::Tensor* in_row, + const lite::Tensor* in_col, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int input_channel, + lite::Tensor* col) { + int batch = input.lod()[0].size() - 1; + const auto& bottom_offset = input.lod()[0]; + // 2-D lod info. + const auto& offset_x = in_col->lod()[0]; + const auto& offset_y = in_row->lod()[0]; + + // top offset is the whole size of each data sample + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_x = top_im_x * top_im_y; + int top_y = input_channel * kernel_h * kernel_w; + top_size += top_y * top_x; + top_offset.push_back(top_size); + } + LoD col_lod; + col_lod.push_back(top_offset); + col->set_lod(col_lod); + std::vector col_dims_vec{top_size}; + col_dims_vec.push_back(1); + col->Resize(col_dims_vec); + auto* top_data = col->mutable_data(); + const auto* bottom_data = input.data(); + + int kernel_win_size = kernel_h * kernel_w; + int half_kernel_h = kernel_h / 2; + int half_kernel_w = kernel_w / 2; + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int top_im_x = (width - 1) / stride_w + 1; + int top_im_y = (height - 1) / stride_h + 1; + int top_x = top_im_y * top_im_x; + for (int z = 0; z < input_channel; ++z) { + int row_offset = kernel_win_size * z; + int im_offset = z * width * height; + for (int y = 0; y < height; y += stride_h) { + for (int x = 0; x < width; x += stride_w) { + int col_offset = x / stride_w + y / stride_h * top_im_x; + for (int ky = 0; ky < kernel_h; ++ky) { + for (int kx = 0; kx < kernel_w; ++kx) { + int im_y = y + ky - half_kernel_h; + int im_x = x + kx - half_kernel_w; + if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) { + top_data[t_offset + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = + bottom_data[b_offset + im_offset + im_y * width + im_x]; + } else { + top_data[t_offset + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = 0; + } + } + } + } + } + } + } +} + +static void naive_sgemm(const bool transpose_A, + const bool transpose_B, + const int M, + const int N, + const int K, + const float alpha, + const float* A, // m x k (after transpose if TransA) + const int lda, // leading dimension of a + const float* B, // k x n (after transpose if TransB) + const int ldb, // leading dimension of b + const float beta, + float* C, // m x n + const int ldc) { + for (int m = 0; m < M; ++m) { + for (int k = 0; k < K; ++k) { + for (int n = 0; n < N; ++n) { + C[m * N + n] += beta * C[m * N + n]; + size_t A_idx = 0, B_idx = 0; + if (transpose_A) { + A_idx = k * M + m; // A is k x m + } else { + A_idx = m * K + k; // A is m x k + } + + if 
(transpose_B) { + B_idx = n * K + k; // B is n x k + } else { + B_idx = k * N + n; // B is k x n + } + + C[m * N + n] += alpha * A[A_idx] * B[B_idx]; + } + } + } +} + +static void var_conv_2d_ref(const lite::Tensor* bottom, + const lite::Tensor* w, + const lite::Tensor* in_row, + const lite::Tensor* in_col, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int input_channel, + const int output_channel, + lite::Tensor* top, + lite::Tensor* col) { + im2col_ref(*bottom, + in_row, + in_col, + kernel_h, + kernel_w, + stride_h, + stride_w, + input_channel, + col); + int batch = bottom->lod()[0].size() - 1; + const auto& col_offset = col->lod()[0]; + const auto& offset_x = in_col->lod()[0]; + const auto& offset_y = in_row->lod()[0]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top->set_lod(top_lod); + std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + auto* top_data = top->mutable_data(); + const auto* w_data = w->data(); + const auto* col_data = col->data(); + + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } + + naive_sgemm(false, + false, + output_channel, + top_im_size, + input_channel * kernel_h * kernel_w, + 1.0, + w_data, + input_channel * kernel_h * kernel_w, + col_data + col_offset[b], + top_im_size, + 0.0, + top_data + top_offset[b], + top_im_size); + } +} + +TEST(var_conv_2d_cuda, normal) { + VarConv2DCompute var_conv_kernel; + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + operators::VarConv2DParam param; + + lite::Tensor X, W, ROW, COLUMN; + lite::Tensor x_cpu, w_cpu; + lite::Tensor Out, Col, out_cpu, col_cpu; + int kernel_h = 5, kernel_w = 5; + int stride_h = 1, stride_w = 1; + int input_channel = 5, output_channel = 5; + + std::vector w_dims_vec; + w_dims_vec.push_back(output_channel); + w_dims_vec.push_back(input_channel * kernel_h * kernel_w); + W.Resize(w_dims_vec); + w_cpu.Resize(w_dims_vec); + auto* w_cpu_data = w_cpu.mutable_data(); + for (int i = 0; i < W.numel(); ++i) { + w_cpu_data[i] = i - 1.f; + } + + std::vector row_lod_vec{0, 10, 20}; + LoD row_lod; + row_lod.push_back(row_lod_vec); + ROW.set_lod(row_lod); + + std::vector column_lod_vec{0, 10, 20}; + LoD column_lod; + column_lod.push_back(column_lod_vec); + COLUMN.set_lod(column_lod); + + int x_size = 0; + std::vector x_lod_vec; + x_lod_vec.push_back(0); + for (size_t i = 0; i < row_lod_vec.size() - 1; ++i) { + int height = row_lod_vec[i + 1] - row_lod_vec[i]; + int width = column_lod_vec[i + 1] - column_lod_vec[i]; + x_lod_vec.push_back(x_lod_vec.back() + height * width); + x_size += height * width; + } + for (size_t i = 0; i < x_lod_vec.size(); ++i) { + x_lod_vec[i] *= input_channel; + } + x_size *= input_channel; + std::vector x_dims_vec{x_size, 1}; + LoD x_lod; + x_lod.push_back(x_lod_vec); + x_lod.push_back(row_lod_vec); + 
x_lod.push_back(column_lod_vec); + X.Resize(x_dims_vec); + x_cpu.Resize(x_dims_vec); + X.set_lod(x_lod); + x_cpu.set_lod(x_lod); + auto* x_cpu_data = x_cpu.mutable_data(); + for (int i = 0; i < X.numel(); ++i) { + x_cpu_data[i] = i % 20 * 1.f; + } + + int sum_num = 0; + int out_sum_num = 0; + for (size_t i = 0; i < row_lod_vec.size() - 1; ++i) { + int height = row_lod_vec[i + 1] - row_lod_vec[i]; + int width = column_lod_vec[i + 1] - column_lod_vec[i]; + sum_num += height * width * input_channel * kernel_h * kernel_w; + out_sum_num += height * width * output_channel; + } + col_cpu.Resize({sum_num, 1}); + out_cpu.Resize({out_sum_num, 1}); + float* out_cpu_data = out_cpu.mutable_data(); + float* col_cpu_data = col_cpu.mutable_data(); + + X.Assign(x_cpu_data, x_cpu.dims()); + W.Assign(w_cpu_data, w_cpu.dims()); + + param.X = &X; + param.W = &W; + // param.ROW = &ROW; + // param.COLUMN = &COLUMN; + param.Out = &Out; + param.Col = &Col; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.kernel_h = kernel_h; + param.kernel_w = kernel_w; + param.input_channel = input_channel; + param.output_channel = output_channel; + var_conv_kernel.SetParam(param); + cudaStream_t stream; + cudaStreamCreate(&stream); + context.SetExecStream(stream); + var_conv_kernel.SetContext(std::move(ctx)); + var_conv_kernel.Run(); + cudaDeviceSynchronize(); + + const float* out_data = Out.data(); + const float* col_data = Col.data(); + + CopySync( + out_cpu_data, out_data, sizeof(float) * Out.numel(), IoDirection::DtoH); + CopySync( + col_cpu_data, col_data, sizeof(float) * Col.numel(), IoDirection::DtoH); + + lite::Tensor top_ref, col_ref; + var_conv_2d_ref(&x_cpu, + &w_cpu, + &ROW, + &COLUMN, + kernel_h, + kernel_w, + stride_h, + stride_w, + input_channel, + output_channel, + &top_ref, + &col_ref); + + for (int i = 0; i < Out.numel(); ++i) { + EXPECT_NEAR(out_cpu_data[i], top_ref.data()[i], 1e-5); + } + for (int i = 0; i < Col.numel(); ++i) { + EXPECT_NEAR(col_cpu_data[i], col_ref.data()[i], 1e-5); + } +} + +} // namespace cuda +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/fpga/conv_compute.cc b/lite/kernels/fpga/conv_compute.cc index 3e06e103bb..8bc171dd67 100644 --- a/lite/kernels/fpga/conv_compute.cc +++ b/lite/kernels/fpga/conv_compute.cc @@ -36,8 +36,15 @@ void ConvCompute::PrepareForRun() { conv_param.filter = param.filter->ZynqTensor(); conv_param.groups = param.groups; conv_param.strides = param.strides; + auto paddings = *param.paddings; conv_param.paddings = param.paddings; conv_param.dilations = param.dilations; + bool pad_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + if (!pad_equal) { + LOG(FATA) << "This pad not support ! 
" << paddings[0] << ", " << paddings[1] + << ", " << paddings[2] << ", " << paddings[3]; + } fill_scale_bias_const(&conv_param); conv_param.bias()->copyFrom(param.bias->ZynqTensor()); conv_param.relu.enabled = param.fuse_relu; diff --git a/lite/kernels/fpga/conv_compute_test.cc b/lite/kernels/fpga/conv_compute_test.cc index f166974cc9..1e05c1fa0c 100644 --- a/lite/kernels/fpga/conv_compute_test.cc +++ b/lite/kernels/fpga/conv_compute_test.cc @@ -141,13 +141,15 @@ void conv_compute_ref(const operators::ConvParam& param) { int group = param.groups; int kernel_w = param.filter->dims()[2]; int kernel_h = param.filter->dims()[3]; + + auto paddings = *param.paddings; + auto dilations = *para.dilations; int stride_w = param.strides[0]; int stride_h = param.strides[1]; - int dila_w = param.dilations[0]; - int dila_h = param.dilations[1]; - - int pad_w = param.paddings[0]; - int pad_h = param.paddings[1]; + int dila_w = dilations[0]; + int dila_h = dilations[1]; + int pad_w = paddings[2]; + int pad_h = paddings[0]; bool flag_bias = (param.bias != nullptr); bool flag_relu = param.fuse_relu; @@ -277,10 +279,14 @@ TEST(conv_fpga, compute) { param.bias = &bias; } param.fuse_relu = flag_relu; - param.paddings = std::vector({padding, padding}); + std::vector paddings = { + padding, padding, padding, padding}; param.strides = std::vector({stride, stride}); + std::vector dilations = {dilation, dilation}; + param.paddings = + std::make_shared>(paddings); param.dilations = - std::vector({dilation, dilation}); + std::make_shared>(dilations); param.groups = group; conv.SetParam(param); conv.Launch(); diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt index 032de81974..79d1bf2fd5 100644 --- a/lite/kernels/npu/bridges/CMakeLists.txt +++ b/lite/kernels/npu/bridges/CMakeLists.txt @@ -19,6 +19,9 @@ lite_cc_library(npu_bridge_split_op SRCS split_op.cc DEPS ${npu_bridge_deps}) lite_cc_library(npu_bridge_concat_op SRCS concat_op.cc DEPS ${npu_bridge_deps}) lite_cc_library(npu_bridge_shuffle_channel_op SRCS shuffle_channel_op.cc DEPS ${npu_bridge_deps}) lite_cc_library(npu_bridge_pad2d_op SRCS pad2d_op.cc DEPS ${npu_bridge_deps}) +lite_cc_library(npu_bridge_square_op SRCS square_op.cc DEPS ${npu_bridge_deps}) +lite_cc_library(npu_bridge_sqrt_op SRCS sqrt_op.cc DEPS ${npu_bridge_deps}) +lite_cc_library(npu_bridge_reduce_mean_op SRCS reduce_mean_op.cc DEPS ${npu_bridge_deps}) set(npu_bridges npu_bridge_registry @@ -39,6 +42,9 @@ set(npu_bridges npu_bridge_concat_op npu_bridge_shuffle_channel_op npu_bridge_pad2d_op + npu_bridge_square_op + npu_bridge_sqrt_op + npu_bridge_reduce_mean_op CACHE INTERNAL "npu_bridges") set(npu_bridge_test_deps ${npu_bridges} ${npu_kernels} ${ops}) @@ -60,5 +66,8 @@ lite_cc_test(test_npu_bridge_split_op SRCS split_op_test.cc test_helper.cc DEPS lite_cc_test(test_npu_bridge_concat_op SRCS concat_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) lite_cc_test(test_npu_bridge_shuffle_channel_op SRCS shuffle_channel_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) lite_cc_test(test_npu_bridge_pad2d_op SRCS pad2d_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) +lite_cc_test(test_npu_bridge_square_op SRCS square_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) +lite_cc_test(test_npu_bridge_sqrt_op SRCS sqrt_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) +lite_cc_test(test_npu_bridge_reduce_mean_op SRCS reduce_mean_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps}) message(STATUS "+++++ npu_bridges: ${npu_bridges}") 
diff --git a/lite/kernels/npu/bridges/act_op.cc b/lite/kernels/npu/bridges/act_op.cc index 51b49091cd..ac62891113 100644 --- a/lite/kernels/npu/bridges/act_op.cc +++ b/lite/kernels/npu/bridges/act_op.cc @@ -41,6 +41,19 @@ node_map_type ActConverter(const std::shared_ptr act_op, // clipped_relu etc. act_node->set_attr_mode(lite::npu::CvtActMode(op_type)); + if (op_type == "relu_clipped") { + auto Relu_clipped_coef = op_info->GetAttr("Relu_clipped_coef"); + act_node->set_attr_coef(Relu_clipped_coef); + } else if (op_type == "leaky_relu") { + auto alpha = op_info->GetAttr("alpha"); + act_node->set_attr_negative_slope(alpha); + } else if (op_type == "hard_sigmoid") { + auto slope = op_info->GetAttr("slope"); + auto offset = op_info->GetAttr("offset"); + act_node->set_attr_negative_slope(slope); + act_node->set_attr_coef(offset); + } + node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = act_node; return outputs_map; @@ -52,14 +65,18 @@ node_map_type ActConverter(const std::shared_ptr act_op, } // namespace lite } // namespace paddle -REGISTER_NPU_BRIDGE(sigmod, paddle::lite::kernels::npu::bridges::ActConverter); +REGISTER_NPU_BRIDGE(sigmoid, paddle::lite::kernels::npu::bridges::ActConverter); REGISTER_NPU_BRIDGE(relu, paddle::lite::kernels::npu::bridges::ActConverter); REGISTER_NPU_BRIDGE(tanh, paddle::lite::kernels::npu::bridges::ActConverter); -REGISTER_NPU_BRIDGE(elu, paddle::lite::kernels::npu::bridges::ActConverter); +REGISTER_NPU_BRIDGE(relu_clipped, + paddle::lite::kernels::npu::bridges::ActConverter); +// REGISTER_NPU_BRIDGE(elu, paddle::lite::kernels::npu::bridges::ActConverter); +REGISTER_NPU_BRIDGE(leaky_relu, + paddle::lite::kernels::npu::bridges::ActConverter); REGISTER_NPU_BRIDGE(abs, paddle::lite::kernels::npu::bridges::ActConverter); REGISTER_NPU_BRIDGE(softsign, paddle::lite::kernels::npu::bridges::ActConverter); REGISTER_NPU_BRIDGE(softplus, paddle::lite::kernels::npu::bridges::ActConverter); -REGISTER_NPU_BRIDGE(hardsigmoid, +REGISTER_NPU_BRIDGE(hard_sigmoid, paddle::lite::kernels::npu::bridges::ActConverter); diff --git a/lite/kernels/npu/bridges/act_op_test.cc b/lite/kernels/npu/bridges/act_op_test.cc index 420de655dc..d50b1968b1 100644 --- a/lite/kernels/npu/bridges/act_op_test.cc +++ b/lite/kernels/npu/bridges/act_op_test.cc @@ -17,7 +17,7 @@ #include "lite/core/op_registry.h" #include "lite/kernels/npu/bridges/registry.h" #include "lite/kernels/npu/bridges/test_helper.h" -#include "lite/operators/relu_op.h" +#include "lite/operators/activation_ops.h" namespace paddle { namespace lite { @@ -25,69 +25,112 @@ namespace kernels { namespace npu { namespace bridges { -void relu_ref(const std::shared_ptr op) { +void act_ref(const std::shared_ptr op) { Scope* scope = op->scope(); const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto op_type = op_info->Type(); + auto x = scope->FindTensor("x"); + auto out = scope->FindMutableTensor("out_ref"); + out->Resize(x->dims()); auto x_data = x->data(); auto out_data = out->mutable_data(); - DDim x_dims = x->dims(); - DDim out_dims = out->dims(); - CHECK_EQ(x_dims.production(), out_dims.production()); - for (int i = 0; i < out_dims.production(); i++) { - out_data[i] = std::max(0.f, x_data[i]); + CHECK_EQ(x->numel(), out->numel()); + + // "sigmoid","relu","tanh","relu_clipped","leaky_relu","softsign","hard_sigmoid" + if (op_type == "sigmoid") { + for (size_t i = 0; i < out->numel(); 
i++) { + out_data[i] = 1.f / (1.f + std::exp(-x_data[i])); + } + } else if (op_type == "relu") { + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = std::max(0.f, x_data[i]); + } + } else if (op_type == "tanh") { + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = (std::exp(x_data[i]) - std::exp(-x_data[i])) / + (std::exp(x_data[i]) + std::exp(-x_data[i])); + } + } else if (op_type == "relu_clipped") { + auto relu_clipped_coef = op_info->GetAttr("Relu_clipped_coef"); + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = std::min(std::max(0.f, x_data[i]), relu_clipped_coef); + } + } else if (op_type == "leaky_relu") { + auto alpha = op_info->GetAttr("alpha"); + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = std::max(x_data[i], x_data[i] * alpha); + } + } else if (op_type == "softsign") { + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = x_data[i] / (1 + std::abs(x_data[i])); + } + } else if (op_type == "hard_sigmoid") { + auto slope = op_info->GetAttr("slope"); + auto offset = op_info->GetAttr("offset"); + for (size_t i = 0; i < out->numel(); i++) { + out_data[i] = std::min(1.f, slope * x_data[i] + offset); + out_data[i] = std::max(0.f, out_data[i]); + } + } else { + LOG(FATAL) << "unsupported activation type: " << op_type; } } -void test_relu(int bs, int ic, int ih, int iw) { +void test_act(std::vector x_shape, std::string op_type) { // prepare input&output variables Scope scope; std::string x_var_name("x"); std::string out_var_name("out"); std::string out_ref_var_name("out_ref"); - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); + auto* x = scope.NewTensor(x_var_name); + auto* out = scope.NewTensor(out_var_name); + auto* out_ref = scope.NewTensor(out_ref_var_name); + x->Resize(x_shape); // initialize input&output data - FillTensor(x); + FillTensor(x, -8, 8); // initialize op desc cpp::OpDesc opdesc; - opdesc.SetType("relu"); + opdesc.SetType(op_type); opdesc.SetInput("X", {x_var_name}); opdesc.SetOutput("Out", {out_var_name}); + if (op_type == "relu_clipped") { + opdesc.SetAttr("Relu_clipped_coef", 6.f); + } else if (op_type == "leaky_relu") { + opdesc.SetAttr("alpha", 0.02f); + } else if (op_type == "hard_sigmoid") { + opdesc.SetAttr("slope", 0.2f); + opdesc.SetAttr("offset", 0.5f); + } // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); + auto op = CreateOp(opdesc, &scope); LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); // execute reference implementation and save to output tensor - relu_ref(op); + act_ref(op); // compare results auto* out_data = out->mutable_data(); auto* out_ref_data = out_ref->mutable_data(); for (int i = 0; i < out->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); } } -TEST(NPUBridges, relu) { - for (auto bs : {1, 3}) { - for (auto ic : {3, 4}) { - for (auto ih : {2, 5}) { - for (auto iw : {5, 9}) { - VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih - << " iw: " << iw; - test_relu(bs, ic, ih, iw); - } - } +TEST(NPUBridges, activation) { + std::vector> shapes{{1}, {2, 3}, {1, 2, 3, 4}}; + std::vector types{"sigmoid", + "relu", + "tanh", + "relu_clipped", + "leaky_relu", + "softsign", + "hard_sigmoid"}; + for (auto x_shape : shapes) { + for (auto op_type : types) { + test_act(x_shape, op_type); } } } 
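Editor's note: a minimal sketch (not part of this patch) of the closed-form references that act_ref and test_act above exercise, written as standalone functions. The default attribute values mirror the ones the test sets (Relu_clipped_coef = 6, alpha = 0.02, slope = 0.2, offset = 0.5) and are shown for illustration only.

#include <algorithm>

// relu_clipped: clamp the ReLU output at coef.
inline float relu_clipped_ref(float x, float coef = 6.f) {
  return std::min(std::max(0.f, x), coef);
}
// leaky_relu: pass positives through, scale negatives by alpha.
inline float leaky_relu_ref(float x, float alpha = 0.02f) {
  return std::max(x, x * alpha);
}
// hard_sigmoid: linear ramp slope * x + offset, clipped to [0, 1].
inline float hard_sigmoid_ref(float x, float slope = 0.2f, float offset = 0.5f) {
  return std::max(0.f, std::min(1.f, slope * x + offset));
}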
@@ -98,5 +141,20 @@ TEST(NPUBridges, relu) { } // namespace lite } // namespace paddle +USE_LITE_OP(sigmoid); +USE_NPU_BRIDGE(sigmoid); USE_LITE_OP(relu); USE_NPU_BRIDGE(relu); +USE_LITE_OP(tanh); +USE_NPU_BRIDGE(tanh); +USE_LITE_OP(relu_clipped); +USE_NPU_BRIDGE(relu_clipped); + +USE_LITE_OP(leaky_relu); +USE_NPU_BRIDGE(leaky_relu); + +USE_LITE_OP(softsign); +USE_NPU_BRIDGE(softsign); + +USE_LITE_OP(hard_sigmoid); +USE_NPU_BRIDGE(hard_sigmoid); diff --git a/lite/kernels/npu/bridges/batch_norm_op.cc b/lite/kernels/npu/bridges/batch_norm_op.cc index 6f5f00959b..8c3153d242 100644 --- a/lite/kernels/npu/bridges/batch_norm_op.cc +++ b/lite/kernels/npu/bridges/batch_norm_op.cc @@ -30,8 +30,8 @@ node_map_type BatchNormConverter( auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "[NPU] Converting " + op_type + "..."; - std::shared_ptr batch_norm_node = - std::make_shared(unique_op_type); + std::shared_ptr batch_norm_node = + std::make_shared(unique_op_type); auto x_var_name = op_info->Input("X").front(); auto scale_var_name = op_info->Input("Scale").front(); @@ -66,7 +66,7 @@ node_map_type BatchNormConverter( batch_norm_node->set_input_x(*inputs_map.at(x_var_name)); batch_norm_node->set_input_scale(*npu_scale); - batch_norm_node->set_input_b(*npu_bias); + batch_norm_node->set_input_offset(*npu_bias); batch_norm_node->set_input_mean(*npu_mean); batch_norm_node->set_input_variance(*npu_variance); batch_norm_node->set_attr_momentum(npu_momentum); diff --git a/lite/kernels/npu/bridges/conv_op.cc b/lite/kernels/npu/bridges/conv_op.cc index 32f4d511d5..8dc9ab1f0f 100644 --- a/lite/kernels/npu/bridges/conv_op.cc +++ b/lite/kernels/npu/bridges/conv_op.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "lite/operators/conv_op.h" #include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" @@ -42,9 +43,9 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, auto bs = input_dims[0]; auto ic = input_dims[1]; auto oc = filter_dims[0]; - CHECK_EQ(input_dims.size(), 4); - CHECK_EQ(output_dims.size(), 4); - CHECK_EQ(filter_dims.size(), 4); + CHECK_EQ(input_dims.size(), 4L); + CHECK_EQ(output_dims.size(), 4L); + CHECK_EQ(filter_dims.size(), 4L); CHECK_EQ(output_dims[0], bs); CHECK_EQ(output_dims[1], oc); auto strides = op_info->GetAttr>("strides"); @@ -52,9 +53,28 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, auto groups = op_info->GetAttr("groups"); auto dilations = op_info->GetAttr>("dilations"); auto fuse_relu = op_info->GetAttr("fuse_relu"); - CHECK_EQ(strides.size(), 2); - CHECK_EQ(paddings.size(), 2); - CHECK_EQ(dilations.size(), 2); + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + + if (paddings.size() == 2L) { + for (size_t i = 0; i < strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "Paddings size should be the same or twice as the input size."; + + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + operators::UpdatePaddingAndDilation(&paddings, + &dilations, + strides, + padding_algorithm, + input_dims, + filter_dims); // check depthwise mode, and decide whether use ConvolutionDepthwise Op bool use_depthwise_conv = @@ -134,7 +154,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, depthwise_conv_node->set_attr_pad_mode(5); // VALID depthwise_conv_node->set_attr_group(groups); depthwise_conv_node->set_attr_pad(ge::AttrValue::LIST_INT( - {paddings[0], paddings[0], paddings[1], paddings[1]})); + {paddings[0], paddings[1], paddings[2], paddings[3]})); depthwise_conv_node->set_attr_dilation( ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); depthwise_conv_node->set_attr_stride( @@ -161,7 +181,7 @@ node_map_type ConvConverter(const std::shared_ptr conv_op, common_conv_node->set_attr_pad_mode(0); // NOTSET common_conv_node->set_attr_group(groups); common_conv_node->set_attr_pad(ge::AttrValue::LIST_INT( - {paddings[0], paddings[0], paddings[1], paddings[1]})); + {paddings[0], paddings[0], paddings[2], paddings[2]})); common_conv_node->set_attr_dilation( ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); common_conv_node->set_attr_stride( diff --git a/lite/kernels/npu/bridges/conv_op_test.cc b/lite/kernels/npu/bridges/conv_op_test.cc index 26309aa9e2..909061d2ba 100644 --- a/lite/kernels/npu/bridges/conv_op_test.cc +++ b/lite/kernels/npu/bridges/conv_op_test.cc @@ -54,7 +54,7 @@ void conv_ref(const std::shared_ptr op) { int stride_h = strides[0]; int dila_w = dilations[1]; int dila_h = dilations[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; int pad_h = paddings[0]; int batch_size = input_dims[0]; int in_ch_size = input_dims[1]; @@ -175,7 +175,8 @@ void test_conv(int bs, opdesc.SetOutput("Output", {output_var_name}); opdesc.SetAttr("dilations", std::vector({dilation, dilation})); opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); opdesc.SetAttr("groups", groups); opdesc.SetAttr("fuse_relu", static_cast(fuse_relu)); if 
(has_bias) { diff --git a/lite/kernels/npu/bridges/conv_transpose_op.cc b/lite/kernels/npu/bridges/conv_transpose_op.cc index 5ae99ef046..6eff4cb2d2 100644 --- a/lite/kernels/npu/bridges/conv_transpose_op.cc +++ b/lite/kernels/npu/bridges/conv_transpose_op.cc @@ -44,9 +44,17 @@ node_map_type ConvTransposeConverter( auto groups = op_info->GetAttr("groups"); auto dilations = op_info->GetAttr>("dilations"); auto fuse_relu = op_info->GetAttr("fuse_relu"); - CHECK_EQ(strides.size(), 2); - CHECK_EQ(paddings.size(), 2); - CHECK_EQ(dilations.size(), 2); + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + + if (paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "Paddings size should be the same or twice as the input size."; // create deconv node auto conv_transpose_node = @@ -82,12 +90,11 @@ node_map_type ConvTransposeConverter( lite::npu::OpList::Global().add(inputs_map.at(input_var_name)); // set attributes - conv_transpose_node->set_attr_mode(1); conv_transpose_node->set_attr_format(0); // NCHW conv_transpose_node->set_attr_pad_mode(0); // NOTSET conv_transpose_node->set_attr_group(groups); conv_transpose_node->set_attr_pad(ge::AttrValue::LIST_INT( - {paddings[0], paddings[0], paddings[1], paddings[1]})); + {paddings[0], paddings[1], paddings[2], paddings[3]})); conv_transpose_node->set_attr_dilation( ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); conv_transpose_node->set_attr_stride( diff --git a/lite/kernels/npu/bridges/conv_transpose_op_test.cc b/lite/kernels/npu/bridges/conv_transpose_op_test.cc index a009ef588e..f96e57c06f 100644 --- a/lite/kernels/npu/bridges/conv_transpose_op_test.cc +++ b/lite/kernels/npu/bridges/conv_transpose_op_test.cc @@ -278,7 +278,8 @@ void test_conv_transpose(int bs, opdesc.SetOutput("Output", {output_var_name}); opdesc.SetAttr("dilations", std::vector({dilation, dilation})); opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); opdesc.SetAttr("groups", groups); opdesc.SetAttr("fuse_relu", static_cast(fuse_relu)); if (has_bias) { diff --git a/lite/kernels/npu/bridges/elementwise_ops.cc b/lite/kernels/npu/bridges/elementwise_ops.cc index 2ec757ab14..5eb5f4e271 100644 --- a/lite/kernels/npu/bridges/elementwise_ops.cc +++ b/lite/kernels/npu/bridges/elementwise_ops.cc @@ -21,6 +21,30 @@ namespace kernels { namespace npu { namespace bridges { +std::vector CvtYShape(const Tensor& x, Tensor* y, int axis) { + auto x_dims = x.dims(); + CHECK_EQ(x_dims.size(), 4UL) << "[NPU] only support 4-dimension x"; + auto y_dims = y->dims(); + CHECK_GE(x_dims.size(), y_dims.size()); + + if (axis < 0) { + axis += x_dims.size(); + } + + std::vector y_new_shape(y_dims.Vectorize()); + if (y_new_shape.size() == 4UL) { + return y_new_shape; + } + for (int i = 0; i < axis; i++) { + y_new_shape.insert(y_new_shape.begin(), 1); + } + while (y_new_shape.size() < 4) { + y_new_shape.push_back(1); + } + CHECK_EQ(y_new_shape.size(), 4UL); + return y_new_shape; +} + node_map_type ElementwiseConverter( const std::shared_ptr elementwise_op, const node_map_type& inputs_map) { @@ -30,34 +54,53 @@ node_map_type ElementwiseConverter( auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "[NPU] Converting " + op_type + "..."; - std::shared_ptr elementwise_node = - 
std::make_shared(unique_op_type); - auto x_var_name = op_info->Input("X").front(); auto y_var_name = op_info->Input("Y").front(); - - CHECK_EQ(op_info->GetAttr("axis"), -1) - << "[NPU] elementwise only support inputs with same size"; - CHECK(inputs_map.find(x_var_name) != inputs_map.end()); - elementwise_node->set_input_x1(*inputs_map.at(x_var_name)); - lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + auto axis = op_info->GetAttr("axis"); + std::shared_ptr elementwise_node = nullptr; + std::shared_ptr x_node = inputs_map.at(x_var_name); + std::shared_ptr y_node = nullptr; if (inputs_map.find(y_var_name) != inputs_map.end()) { - elementwise_node->set_input_x2(*inputs_map.at(y_var_name)); - lite::npu::OpList::Global().add(inputs_map.at(y_var_name)); + y_node = inputs_map.at(y_var_name); } else { auto y_const_node = std::make_shared(y_var_name); - auto* y = scope->FindVar(y_var_name)->GetMutable(); - y_const_node->set_attr_value(lite::npu::CvtTensor(y)); - elementwise_node->set_input_x2(*y_const_node); - lite::npu::OpList::Global().add(y_const_node); + auto x = scope->FindTensor(x_var_name); + auto y = scope->FindMutableTensor(y_var_name); + auto y_new_shape = CvtYShape(*x, y, axis); + y_const_node->set_attr_value(lite::npu::CvtTensor(y, y_new_shape)); + y_node = y_const_node; } + lite::npu::OpList::Global().add(x_node); + lite::npu::OpList::Global().add(y_node); - lite::npu::OpList::Global().add(elementwise_node); + if (op_type == "elementwise_add" || + op_type == "fusion_elementwise_add_activation") { + auto elt_node = std::make_shared(unique_op_type); + elt_node->set_input_x1(*x_node); + elt_node->set_input_x2(*y_node); + elementwise_node = elt_node; + } else if (op_type == "elementwise_sub") { + auto elt_node = std::make_shared(unique_op_type); + elt_node->set_input_x1(*x_node); + elt_node->set_input_x2(*y_node); + elementwise_node = elt_node; + } else if (op_type == "elementwise_mul") { + auto elt_node = std::make_shared(unique_op_type); + elt_node->set_input_x(*x_node); + elt_node->set_input_y(*y_node); + elementwise_node = elt_node; + } else if (op_type == "elementwise_div") { + auto elt_node = std::make_shared(unique_op_type); + elt_node->set_input_x1(*x_node); + elt_node->set_input_x2(*y_node); + elementwise_node = elt_node; + } else { + LOG(FATAL) << "unsupported op type: " << op_type; + } - // paddlelite has sum only - elementwise_node->set_attr_mode(1); + lite::npu::OpList::Global().add(elementwise_node); node_map_type outputs_map; if (op_type == "fusion_elementwise_add_activation") { @@ -86,3 +129,9 @@ REGISTER_NPU_BRIDGE(elementwise_add, paddle::lite::kernels::npu::bridges::ElementwiseConverter); REGISTER_NPU_BRIDGE(fusion_elementwise_add_activation, paddle::lite::kernels::npu::bridges::ElementwiseConverter); +REGISTER_NPU_BRIDGE(elementwise_sub, + paddle::lite::kernels::npu::bridges::ElementwiseConverter); +REGISTER_NPU_BRIDGE(elementwise_mul, + paddle::lite::kernels::npu::bridges::ElementwiseConverter); +REGISTER_NPU_BRIDGE(elementwise_div, + paddle::lite::kernels::npu::bridges::ElementwiseConverter); diff --git a/lite/kernels/npu/bridges/elementwise_ops_test.cc b/lite/kernels/npu/bridges/elementwise_ops_test.cc index 0e2fc9f262..8dd4c851ca 100644 --- a/lite/kernels/npu/bridges/elementwise_ops_test.cc +++ b/lite/kernels/npu/bridges/elementwise_ops_test.cc @@ -29,37 +29,28 @@ template void elementwise_add_ref(const std::shared_ptr op) { Scope* scope = op->scope(); const OpInfo* op_info = op->op_info(); - auto x = 
scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto y = scope->FindVar(op_info->Input("Y").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); + auto x = scope->FindTensor("x"); + auto y = scope->FindTensor("y"); + auto out = scope->FindMutableTensor("out_ref"); + out->Resize(x->dims()); auto x_data = x->data(); auto y_data = y->data(); - dtype* out_data = out->mutable_data(); + auto out_data = out->mutable_data(); auto x_dims = x->dims(); auto y_dims = y->dims(); int axis = op_info->GetAttr("axis"); if (axis < 0) { - axis = x_dims.size() - y_dims.size(); + axis += x_dims.size(); } int batch = 1; - int channels = 1; - int num = 1; - for (int i = 0; i < axis; ++i) { - batch *= x_dims[i]; - } - for (int i = 0; i < y_dims.size(); ++i) { - channels *= y_dims[i]; - } - for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { - num *= x_dims[i]; - } + int channels = y->numel(); + int num = x->numel() / channels / batch; // do elementwise add/sub/max... - std::string elt_type = "add"; - if (elt_type == "add") { + std::string op_type = op_info->Type(); + if (op_type == "elementwise_add") { for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { int offset = (i * channels + j) * num; @@ -73,7 +64,7 @@ void elementwise_add_ref(const std::shared_ptr op) { } } } - } else if (elt_type == "sub") { + } else if (op_type == "elementwise_sub") { for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { int offset = (i * channels + j) * num; @@ -87,7 +78,7 @@ void elementwise_add_ref(const std::shared_ptr op) { } } } - } else if (elt_type == "mul") { + } else if (op_type == "elementwise_mul") { for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { int offset = (i * channels + j) * num; @@ -101,7 +92,21 @@ void elementwise_add_ref(const std::shared_ptr op) { } } } - } else if (elt_type == "max") { + } else if (op_type == "elementwise_div") { + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + int offset = (i * channels + j) * num; + const dtype* din_ptr = x_data + offset; + const dtype diny_data = y_data[j]; + dtype* dout_ptr = out_data + offset; + for (int k = 0; k < num; ++k) { + *dout_ptr = *din_ptr / diny_data; + dout_ptr++; + din_ptr++; + } + } + } + } else if (op_type == "elementwise_max") { for (int i = 0; i < batch; ++i) { for (int j = 0; j < channels; ++j) { int offset = (i * channels + j) * num; @@ -116,11 +121,14 @@ void elementwise_add_ref(const std::shared_ptr op) { } } } else { - LOG(FATAL) << "unsupported Elementwise type: " << elt_type; + LOG(FATAL) << "unsupported Elementwise type: " << op_type; } } -void test_elementwise_add(int bs, int ic, int ih, int iw, int axis) { +void test_elementwise_add(const std::vector& x_shape, + const std::vector& y_shape, + int axis, + std::string elt_type) { // prepare input&output variables Scope scope; std::string x_var_name = "x"; @@ -131,16 +139,16 @@ void test_elementwise_add(int bs, int ic, int ih, int iw, int axis) { auto* y = scope.Var(y_var_name)->GetMutable(); auto* out = scope.Var(out_var_name)->GetMutable(); auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - y->Resize({bs, ic, ih, iw}); + x->Resize(x_shape); + y->Resize(y_shape); // initialize input&output data - FillTensor(x); - FillTensor(y); + FillTensor(x, 1, 3); + FillTensor(y, 1, 3); // initialize op desc cpp::OpDesc opdesc; - opdesc.SetType("elementwise_add"); + opdesc.SetType("elementwise_" + elt_type); 
opdesc.SetInput("X", {x_var_name}); opdesc.SetInput("Y", {y_var_name}); opdesc.SetOutput("Out", {out_var_name}); @@ -149,7 +157,6 @@ void test_elementwise_add(int bs, int ic, int ih, int iw, int axis) { // create and convert op to NPU model, then run it on NPU auto op = CreateOp(opdesc, &scope); LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); // execute reference implementation and save to output tensor elementwise_add_ref(op); @@ -158,19 +165,15 @@ void test_elementwise_add(int bs, int ic, int ih, int iw, int axis) { auto* out_data = out->mutable_data(); auto* out_ref_data = out_ref->mutable_data(); for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-1); + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); } } TEST(NPUBridges, elementwise_add) { - for (auto bs : {1, 4, 7}) { - for (auto ic : {1, 4, 7}) { - for (auto ih : {1, 4, 7}) { - for (auto iw : {1, 4, 7}) { - for (auto axis : {-1}) test_elementwise_add(bs, ic, ih, iw, axis); - } - } - } + for (auto elt_type : {"add", "sub", "mul", "div"}) { + test_elementwise_add({1, 2, 3, 4}, {2}, 1, elt_type); + test_elementwise_add({1, 2, 3, 4}, {1, 2, 1, 1}, 1, elt_type); + test_elementwise_add({1, 2, 3, 4}, {1, 2, 3, 4}, 3, elt_type); } } @@ -182,3 +185,9 @@ TEST(NPUBridges, elementwise_add) { USE_LITE_OP(elementwise_add); USE_NPU_BRIDGE(elementwise_add); +USE_LITE_OP(elementwise_sub); +USE_NPU_BRIDGE(elementwise_sub); +USE_LITE_OP(elementwise_mul); +USE_NPU_BRIDGE(elementwise_mul); +USE_LITE_OP(elementwise_div); +USE_NPU_BRIDGE(elementwise_div); diff --git a/lite/kernels/npu/bridges/interpolate_op.cc b/lite/kernels/npu/bridges/interpolate_op.cc index 71f5eac57a..8e60a39fe4 100644 --- a/lite/kernels/npu/bridges/interpolate_op.cc +++ b/lite/kernels/npu/bridges/interpolate_op.cc @@ -45,6 +45,7 @@ node_map_type InterpolateConverter( auto out_h = op_info->GetAttr("out_h"); auto align_corners = op_info->GetAttr("align_corners"); int align_mode = op_info->GetAttr("align_mode"); + auto interp_method = op_info->GetAttr("interp_method"); CHECK(!(align_mode == 0 && !align_corners)) << "[NPU] align_mode = 0 && " "align_corners = false isn't " "supported in HiAI DDK"; @@ -58,11 +59,11 @@ node_map_type InterpolateConverter( } // update out_h and out_w if has OutSize - bool inputs_map_has_w = false; + std::shared_ptr out_size_node = nullptr; if (lite::npu::HasInputArg(op_info, scope, "OutSize")) { auto out_size_var_name = op_info->Input("OutSize").front(); if (inputs_map.count(out_size_var_name)) { - inputs_map_has_w = true; + out_size_node = inputs_map.at(out_size_var_name); } else { auto out_size = scope->FindVar(out_size_var_name)->GetMutable(); @@ -73,58 +74,45 @@ node_map_type InterpolateConverter( out_w = out_size_data[1]; } } - - node_map_type outputs_map; - auto interp_method = op_info->GetAttr("interp_method"); - if (interp_method == "bilinear") { - auto interp_node = std::make_shared(unique_op_type); - lite::npu::OpList::Global().add(interp_node); - interp_node->set_input_x(*inputs_map.at(x_var_name)); - if (inputs_map_has_w) { - auto out_size_var_name = op_info->Input("OutSize").front(); - interp_node->set_input_w(*inputs_map.at(out_size_var_name)); - lite::npu::OpList::Global().add(inputs_map.at(out_size_var_name)); - } else { + if (out_size_node == nullptr) { + if (interp_method == "bilinear") { const float largest_multiple = 7.0f; float multiple = static_cast(x_h * x_w) / (out_h * out_w); CHECK_LT(multiple, largest_multiple) << "[NPU] multiple=(ih*iw)/(oh*ow)=" << multiple 
<< " is too large, should not exceed " << largest_multiple << " in HiAI DDK"; - auto w_const_node = - std::make_shared(unique_op_type + "/w"); - w_const_node->set_attr_value( - lite::npu::CreateTensorAndFillData(std::vector({out_h, out_w}))); - interp_node->set_input_w(*w_const_node); - lite::npu::OpList::Global().add(w_const_node); } - interp_node->set_attr_output_dim_mode( - 2); // 0: zoom_factor, 1: shrink_factor, 2: height/width - interp_node->set_attr_align_corners(align_corners); - outputs_map[op_info->Output("Out").front()] = interp_node; + auto out_size_const_node = + std::make_shared(unique_op_type + "/out_size"); + out_size_const_node->set_attr_value( + lite::npu::CreateTensorAndFillData(std::vector({out_h, out_w}))); + out_size_node = out_size_const_node; + } + lite::npu::OpList::Global().add(out_size_node); + + std::shared_ptr interp_node = nullptr; + if (interp_method == "bilinear") { + auto bilinear_interp_node = + std::make_shared(unique_op_type); + bilinear_interp_node->set_input_x(*inputs_map.at(x_var_name)); + bilinear_interp_node->set_input_size(*out_size_node); + bilinear_interp_node->set_attr_align_corners(align_corners); + interp_node = bilinear_interp_node; } else if (interp_method == "nearest") { - auto interp_node = + auto nearest_interp_node = std::make_shared(unique_op_type); - lite::npu::OpList::Global().add(interp_node); - interp_node->set_input_image(*inputs_map.at(x_var_name)); - if (inputs_map_has_w) { - auto out_size_var_name = op_info->Input("OutSize").front(); - interp_node->set_input_size(*inputs_map.at(out_size_var_name)); - lite::npu::OpList::Global().add(inputs_map.at(out_size_var_name)); - } else { - auto w_const_node = - std::make_shared(unique_op_type + "/w"); - w_const_node->set_attr_value( - lite::npu::CreateTensorAndFillData(std::vector({out_h, out_w}))); - interp_node->set_input_size(*w_const_node); - lite::npu::OpList::Global().add(w_const_node); - } - interp_node->set_attr_align_corners(align_corners); - outputs_map[op_info->Output("Out").front()] = interp_node; + nearest_interp_node->set_input_image(*inputs_map.at(x_var_name)); + nearest_interp_node->set_input_size(*out_size_node); + nearest_interp_node->set_attr_align_corners(align_corners); + interp_node = nearest_interp_node; } else { LOG(FATAL) << "[NPU] Unsupported interpolate method: " << interp_method; } + lite::npu::OpList::Global().add(interp_node); + node_map_type outputs_map; + outputs_map[op_info->Output("Out").front()] = interp_node; return outputs_map; } diff --git a/lite/kernels/npu/bridges/mul_op.cc b/lite/kernels/npu/bridges/mul_op.cc index 5f8bdc4ee9..2313351f6c 100644 --- a/lite/kernels/npu/bridges/mul_op.cc +++ b/lite/kernels/npu/bridges/mul_op.cc @@ -31,82 +31,67 @@ node_map_type MulConverter(const std::shared_ptr mul_op, auto unique_op_type = lite::npu::UniqueName(op_type); LOG(INFO) << "[NPU] Converting " + op_type + "..."; - auto output_node = std::make_shared(unique_op_type); - auto x_var_name = op_info->Input("X").front(); auto y_var_name = op_info->Input("Y").front(); + auto x = scope->FindVar(x_var_name)->GetMutable(); + auto y = scope->FindVar(y_var_name)->GetMutable(); + auto x_dims = x->dims(); + auto y_dims = y->dims(); int x_num_col_dims = op_info->GetAttr("x_num_col_dims"); int y_num_col_dims = op_info->GetAttr("y_num_col_dims"); - auto* xtensor = scope->FindVar(x_var_name)->GetMutable(); - auto* ytensor = scope->FindVar(y_var_name)->GetMutable(); - - int m = xtensor->dims().Slice(0, x_num_col_dims).production(); - int x_w = xtensor->dims() - 
.Slice(x_num_col_dims, xtensor->dims().size()) - .production(); - int y_h = ytensor->dims().Slice(0, y_num_col_dims).production(); - int n = ytensor->dims() - .Slice(y_num_col_dims, ytensor->dims().size()) - .production(); - CHECK_EQ(x_w, y_h) << "[NPU] x_w must be equal with y_h"; - int k = x_w; + int m = x_dims.Slice(0, x_num_col_dims).production(); + int k = x_dims.Slice(x_num_col_dims, x_dims.size()).production(); + CHECK_EQ(k, y_dims.Slice(0, y_num_col_dims).production()) + << "[NPU] columns of X must be equal with rows of Y"; + int n = y_dims.Slice(y_num_col_dims, y_dims.size()).production(); LOG(INFO) << "m:" << m << ",n:" << n << ",k:" << k; LOG(INFO) << "x_var_name:" << x_var_name << ", is data: " << inputs_map.count(x_var_name); LOG(INFO) << "y_var_name:" << y_var_name << ", is data: " << inputs_map.count(y_var_name); CHECK(inputs_map.count(x_var_name)) - << "[NPU] MatMul only support X is data, Y is const yet"; + << "[NPU] MatMul in HiAI DDK only support X is data, Y is const yet."; + + auto mul_node = std::make_shared(unique_op_type); + // add input x node which supports persistable and non-persistable tensor, and + // reshape to (m, k) if (inputs_map.count(x_var_name)) { - auto xsrc = inputs_map.at(x_var_name); - auto reshapex = std::make_shared(x_var_name + "_reshape"); - reshapex->set_input_tensor(*xsrc); - reshapex->set_attr_shape({m, k}); - reshapex->set_attr_axis(0); - lite::npu::OpList::Global().add(xsrc); - lite::npu::OpList::Global().add(reshapex); - output_node->set_input_x(*reshapex); + auto reshaped_x_node = + std::make_shared(x_var_name + "_reshape"); + reshaped_x_node->set_input_tensor(*inputs_map.at(x_var_name)); + reshaped_x_node->set_attr_shape({m, k}); + reshaped_x_node->set_attr_axis(0); + mul_node->set_input_x1(*reshaped_x_node); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(reshaped_x_node); } else { - auto constx = std::make_shared(x_var_name); - ge::TensorDesc desc(ge::Shape({m, k}), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto size = desc.GetShape().GetShapeSize(); - CHECK_EQ(size, xtensor->dims().production()); - ge::TensorPtr ptensor = std::make_shared(); - ptensor->SetTensorDesc(desc); - auto* pdata = reinterpret_cast(xtensor->mutable_data()); - ptensor->SetData(pdata, size * sizeof(float)); - constx->set_attr_value(ptensor); - lite::npu::OpList::Global().add(constx); - output_node->set_input_x(*constx); + auto x_const_node = std::make_shared(x_var_name); + x_const_node->set_attr_value(lite::npu::CvtTensor(x, {m, k})); + mul_node->set_input_x1(*x_const_node); + lite::npu::OpList::Global().add(x_const_node); } - + // add input y node which only supports persistable tensor, and reshape to (k, + // n) if (inputs_map.count(y_var_name)) { - auto ysrc = inputs_map.at(y_var_name); - auto reshapey = std::make_shared(y_var_name + "_reshape"); - reshapey->set_input_tensor(*ysrc); - reshapey->set_attr_shape({k, n}); - reshapey->set_attr_axis(0); - lite::npu::OpList::Global().add(ysrc); - lite::npu::OpList::Global().add(reshapey); - output_node->set_input_w(*reshapey); + auto reshaped_y_node = + std::make_shared(y_var_name + "_reshape"); + reshaped_y_node->set_input_tensor(*inputs_map.at(y_var_name)); + reshaped_y_node->set_attr_shape({k, n}); + reshaped_y_node->set_attr_axis(0); + mul_node->set_input_x2(*reshaped_y_node); + lite::npu::OpList::Global().add(inputs_map.at(y_var_name)); + lite::npu::OpList::Global().add(reshaped_y_node); } else { - auto consty = std::make_shared(y_var_name); - ge::TensorDesc 
desc(ge::Shape({k, n}), ge::FORMAT_NCHW, ge::DT_FLOAT); - auto size = desc.GetShape().GetShapeSize(); - CHECK_EQ(size, ytensor->dims().production()); - ge::TensorPtr ptensor = std::make_shared(); - ptensor->SetTensorDesc(desc); - auto* pdata = reinterpret_cast(ytensor->mutable_data()); - ptensor->SetData(pdata, size * sizeof(float)); - consty->set_attr_value(ptensor); - lite::npu::OpList::Global().add(consty); - output_node->set_input_w(*consty); + auto y_const_node = std::make_shared(y_var_name); + y_const_node->set_attr_value(lite::npu::CvtTensor(y, {k, n})); + mul_node->set_input_x2(*y_const_node); + lite::npu::OpList::Global().add(y_const_node); } - lite::npu::OpList::Global().add(output_node); + lite::npu::OpList::Global().add(mul_node); node_map_type outputs_map; - outputs_map[op_info->Output("Out").front()] = output_node; + outputs_map[op_info->Output("Out").front()] = mul_node; return outputs_map; } diff --git a/lite/kernels/npu/bridges/paddle_use_npu_bridges.h b/lite/kernels/npu/bridges/paddle_use_npu_bridges.h index 8b4252de06..9a432d17e5 100644 --- a/lite/kernels/npu/bridges/paddle_use_npu_bridges.h +++ b/lite/kernels/npu/bridges/paddle_use_npu_bridges.h @@ -16,23 +16,40 @@ #include "lite/kernels/npu/bridges/registry.h" -USE_NPU_BRIDGE(mul); -USE_NPU_BRIDGE(fc); +USE_NPU_BRIDGE(sigmoid); +USE_NPU_BRIDGE(relu); +USE_NPU_BRIDGE(tanh); +USE_NPU_BRIDGE(relu_clipped); +USE_NPU_BRIDGE(leaky_relu); +USE_NPU_BRIDGE(softsign); +USE_NPU_BRIDGE(hard_sigmoid); + +USE_NPU_BRIDGE(batch_norm); +USE_NPU_BRIDGE(concat); USE_NPU_BRIDGE(conv2d); USE_NPU_BRIDGE(depthwise_conv2d); -USE_NPU_BRIDGE(pool2d); -USE_NPU_BRIDGE(relu); +USE_NPU_BRIDGE(conv2d_transpose); + USE_NPU_BRIDGE(elementwise_add); USE_NPU_BRIDGE(fusion_elementwise_add_activation); +USE_NPU_BRIDGE(elementwise_sub); +USE_NPU_BRIDGE(elementwise_mul); +USE_NPU_BRIDGE(elementwise_div); + +USE_NPU_BRIDGE(fc); +USE_NPU_BRIDGE(bilinear_interp); +USE_NPU_BRIDGE(nearest_interp); +USE_NPU_BRIDGE(mul); +USE_NPU_BRIDGE(pad2d); +USE_NPU_BRIDGE(pool2d); +USE_NPU_BRIDGE(reduce_mean); +USE_NPU_BRIDGE(reshape); +USE_NPU_BRIDGE(reshape2); USE_NPU_BRIDGE(scale); +USE_NPU_BRIDGE(shuffle_channel); USE_NPU_BRIDGE(softmax); -USE_NPU_BRIDGE(concat); USE_NPU_BRIDGE(split); +USE_NPU_BRIDGE(sqrt); +USE_NPU_BRIDGE(square); USE_NPU_BRIDGE(transpose); USE_NPU_BRIDGE(transpose2); -USE_NPU_BRIDGE(shuffle_channel); -USE_NPU_BRIDGE(batch_norm); -USE_NPU_BRIDGE(bilinear_interp); -USE_NPU_BRIDGE(conv2d_transpose); -USE_NPU_BRIDGE(reshape); -USE_NPU_BRIDGE(reshape2); diff --git a/lite/kernels/npu/bridges/pool_op.cc b/lite/kernels/npu/bridges/pool_op.cc index 5915b7a8aa..7bbe94d5db 100644 --- a/lite/kernels/npu/bridges/pool_op.cc +++ b/lite/kernels/npu/bridges/pool_op.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
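// A minimal sketch, for illustration only (not part of the patch), of the
// flattening that MulConverter above performs before handing X and Y to the
// NPU matmul: dims in front of *_num_col_dims fold into the row count, the
// remaining dims fold into the column count, and the two inner sizes must
// match. E.g. x_dims={2,3,4}, x_num_col_dims=1, y_dims={12,5},
// y_num_col_dims=1 gives m=2, k=12, n=5.
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

static int64_t Prod(const std::vector<int64_t>& dims, size_t begin, size_t end) {
  return std::accumulate(dims.begin() + begin, dims.begin() + end, int64_t{1},
                         std::multiplies<int64_t>());
}

static void FlattenForMul(const std::vector<int64_t>& x_dims, int x_num_col_dims,
                          const std::vector<int64_t>& y_dims, int y_num_col_dims,
                          int64_t* m, int64_t* k, int64_t* n) {
  *m = Prod(x_dims, 0, x_num_col_dims);
  *k = Prod(x_dims, x_num_col_dims, x_dims.size());
  // The converter CHECKs that k equals Prod(y_dims, 0, y_num_col_dims).
  *n = Prod(y_dims, y_num_col_dims, y_dims.size());
}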
+#include "lite/operators/pool_op.h" #include "lite/backends/npu/builder.h" #include "lite/kernels/npu/bridges/registry.h" @@ -32,44 +33,78 @@ node_map_type PoolConverter(const std::shared_ptr pool_op, std::shared_ptr pool_node = std::make_shared(unique_op_type); auto x_var_name = op_info->Input("X").front(); + auto x = scope->FindTensor(x_var_name); + pool_node->set_input_x(*inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(pool_node); + + int mode = 0; auto pooling_type = op_info->GetAttr("pooling_type"); - int npu_mode = 0; if (pooling_type == "max") { - npu_mode = 0; + mode = 0; } else if (pooling_type == "avg") { - npu_mode = 1; + mode = 1; CHECK(op_info->GetAttr("exclusive")) << "[NPU] exclusive must be true in HiAI DDK"; } else { LOG(FATAL) << "[NPU] Unsupported pooling type: " << pooling_type; } - bool npu_global_pooling = op_info->GetAttr("global_pooling"); + pool_node->set_attr_mode(mode); + + int pad_mode = 0; + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + if (padding_algorithm == "SAME") { + pad_mode = 6; + } else if (padding_algorithm == "VALID") { + pad_mode = 5; + } + pool_node->set_attr_pad_mode(pad_mode); + + bool global_pooling = op_info->GetAttr("global_pooling"); + pool_node->set_attr_global_pooling(global_pooling); + auto ksize = op_info->GetAttr>("ksize"); - auto npu_window = ge::AttrValue::LIST_INT(ksize.begin(), ksize.end()); + auto window = ge::AttrValue::LIST_INT(ksize.begin(), ksize.end()); + pool_node->set_attr_window(window); - auto padding = op_info->GetAttr>("paddings"); - auto npu_pad = - ge::AttrValue::LIST_INT{padding[0], padding[0], padding[1], padding[1]}; + auto paddings = op_info->GetAttr>("paddings"); + if (paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "Paddings size should be the same or twice as the inputs size."; + bool adaptive = false; + if (op_info->HasAttr("adaptive")) { + adaptive = op_info->GetAttr("adaptive"); + } auto strides = op_info->GetAttr>("strides"); + operators::UpdatePadding(&paddings, + global_pooling, + adaptive, + padding_algorithm, + x->dims(), + strides, + ksize); + auto npu_pad = ge::AttrValue::LIST_INT{ + paddings[0], paddings[1], paddings[2], paddings[3]}; + pool_node->set_attr_pad(npu_pad); + auto npu_stride = ge::AttrValue::LIST_INT(strides.begin(), strides.end()); - int npu_ceil_mode = 0; + pool_node->set_attr_stride(npu_stride); + + int ceil_mode = 0; if (op_info->HasAttr("ceil_mode")) { - npu_ceil_mode = op_info->GetAttr("ceil_mode") ? 1 : 0; + ceil_mode = op_info->GetAttr("ceil_mode") ? 
1 : 0; } - - pool_node->set_input_x(*inputs_map.at(x_var_name)); - pool_node->set_attr_mode(npu_mode); - pool_node->set_attr_pad_mode(0); - pool_node->set_attr_global_pooling(npu_global_pooling); - pool_node->set_attr_window(npu_window); - pool_node->set_attr_pad(npu_pad); - pool_node->set_attr_stride(npu_stride); - pool_node->set_attr_ceil_mode(npu_ceil_mode); + pool_node->set_attr_ceil_mode(ceil_mode); // output_node->set_attr_data_mode(npu_data_mode); - lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); - lite::npu::OpList::Global().add(pool_node); - node_map_type outputs_map; outputs_map[op_info->Output("Out").front()] = pool_node; return outputs_map; diff --git a/lite/kernels/npu/bridges/pool_op_test.cc b/lite/kernels/npu/bridges/pool_op_test.cc index d4543a6ae1..298e065547 100644 --- a/lite/kernels/npu/bridges/pool_op_test.cc +++ b/lite/kernels/npu/bridges/pool_op_test.cc @@ -61,7 +61,7 @@ void pool_ref(const std::shared_ptr op) { int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; if (global_pooling == true) { for (int n = 0; n < in_n; ++n) { @@ -163,7 +163,8 @@ void test_pool(int bs, opdesc.SetAttr("global_pooling", global_pooling); opdesc.SetAttr("exclusive", exclusive); opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); // create and convert op to NPU model, then run it on NPU auto op = CreateOp(opdesc, &scope); diff --git a/lite/kernels/npu/bridges/reduce_mean_op.cc b/lite/kernels/npu/bridges/reduce_mean_op.cc new file mode 100644 index 0000000000..4725bdfb0e --- /dev/null +++ b/lite/kernels/npu/bridges/reduce_mean_op.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
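// A minimal sketch, for illustration only, of the paddings convention the
// pool_op changes above assume: a 2-element {pad_h, pad_w} vector is expanded
// to the 4-element {pad_top, pad_bottom, pad_left, pad_right} form by
// duplicating each entry, which is why the reference test now reads pad_w
// from paddings[2].
#include <vector>

static std::vector<int> ExpandPaddings(std::vector<int> pads) {
  if (pads.size() == 2) {
    // {pad_h, pad_w} -> {pad_h, pad_h, pad_w, pad_w}
    pads = {pads[0], pads[0], pads[1], pads[1]};
  }
  return pads;  // {top, bottom, left, right}
}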
+ +#include "lite/backends/npu/builder.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +node_map_type ReduceMeanConverter( + const std::shared_ptr reduce_mean_op, + const node_map_type& inputs_map) { + auto scope = reduce_mean_op->scope(); + auto op_info = reduce_mean_op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::npu::UniqueName(op_type); + LOG(INFO) << "[NPU] Converting " + op_type + "..."; + + // get input, and op attributes + auto x_var_name = op_info->Input("X").front(); + auto x_dims = scope->FindTensor(x_var_name)->dims(); + auto keep_dim = op_info->GetAttr("keep_dim"); + auto dim = op_info->GetAttr>("dim"); + CHECK(!dim.empty()) << "\"dim\" of reduce_mean should not be empty."; + for (size_t i = 0; i < dim.size(); i++) { + if (dim[i] < 0) { + dim[i] += x_dims.size(); + } + } + std::sort(dim.begin(), dim.end()); + + // create reduce_mean(reduce_sum + scale) node and set input node from + // inputs_map + // creat reduce_sum node + auto unique_reduce_sum = lite::npu::UniqueName("reduce_sum"); + auto reduce_sum_node = std::make_shared(unique_reduce_sum); + CHECK(inputs_map.count(x_var_name)); + reduce_sum_node->set_input_x(*inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(reduce_sum_node); + + auto dim_const_node = + std::make_shared(unique_reduce_sum + "/dim"); + dim_const_node->set_attr_value(lite::npu::CreateTensorAndFillData(dim)); + reduce_sum_node->set_input_w(*dim_const_node); + lite::npu::OpList::Global().add(dim_const_node); + + reduce_sum_node->set_attr_keep_dims(keep_dim); + + // create scale node + auto unique_scale = lite::npu::UniqueName("scale"); + auto scale_node = std::make_shared(unique_scale); + scale_node->set_input_x(*reduce_sum_node); + lite::npu::OpList::Global().add(scale_node); + + float scale = 1; + for (size_t i = 0; i < dim.size(); i++) { + scale /= x_dims[dim[i]]; + } + + std::vector scale_bias_shape = x_dims.Vectorize(); + if (keep_dim) { + for (size_t i = 0; i < dim.size(); i++) { + scale_bias_shape[dim[i]] = 1; + } + } else { + const int64_t kDelFlag = -2; + for (size_t i = 0; i < dim.size(); ++i) { + scale_bias_shape[dim[i]] = kDelFlag; + } + scale_bias_shape.erase( + remove(scale_bias_shape.begin(), scale_bias_shape.end(), kDelFlag), + scale_bias_shape.end()); + } + + auto filter_const_node = + std::make_shared(unique_scale + "/filter"); + filter_const_node->set_attr_value( + lite::npu::CreateTensorAndFillData(scale, scale_bias_shape)); + scale_node->set_input_filter(*filter_const_node); + lite::npu::OpList::Global().add(filter_const_node); + + scale_node->set_attr_axis(1); + + node_map_type outputs_map; + outputs_map[op_info->Output("Out").front()] = scale_node; + return outputs_map; +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_NPU_BRIDGE(reduce_mean, + paddle::lite::kernels::npu::bridges::ReduceMeanConverter); diff --git a/lite/kernels/npu/bridges/reduce_mean_op_test.cc b/lite/kernels/npu/bridges/reduce_mean_op_test.cc new file mode 100644 index 0000000000..8646ce5c25 --- /dev/null +++ b/lite/kernels/npu/bridges/reduce_mean_op_test.cc @@ -0,0 +1,347 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/reduce_mean_op.h" +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/npu/bridges/test_helper.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +void reduce_mean_n(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + int hw_size = height_in * width_in; + int chw_size = channel_in * hw_size; + int data_index, src_index; + for (int c = 0; c < channel_in; ++c) { + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + data_index = c * hw_size + h * width_in + w; + dst[data_index] = 0.0; + for (int n = 0; n < num_in; ++n) { + src_index = n * chw_size + data_index; + dst[data_index] += static_cast(src[src_index]) / num_in; + } + } + } + } +} + +void reduce_mean_c(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + int hw_size = height_in * width_in; + int chw_size = hw_size * channel_in; + int data_index, src_index0, src_index; + for (int n = 0; n < num_in; ++n) { + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + data_index = n * hw_size + h * width_in + w; + src_index0 = n * chw_size + h * width_in + w; + dst[data_index] = 0.0; + for (int c = 0; c < channel_in; ++c) { + src_index = src_index0 + c * hw_size; + dst[data_index] += static_cast(src[src_index]) / channel_in; + } + } + } + } +} + +void reduce_mean_h(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + int cw_size = channel_in * width_in; + int chw_size = cw_size * height_in; + int hw_size = height_in * width_in; + int data_index, src_index, src_index0; + for (int n = 0; n < num_in; ++n) { + for (int c = 0; c < channel_in; ++c) { + for (int w = 0; w < width_in; ++w) { + data_index = n * cw_size + c * width_in + w; + src_index0 = n * chw_size + c * hw_size + w; + dst[data_index] = 0.0; + for (int h = 0; h < height_in; ++h) { + src_index = src_index0 + h * width_in; + dst[data_index] += static_cast(src[src_index]) / height_in; + } + } + } + } +} + +void reduce_mean_w(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + int ch_size = channel_in * height_in; + int hw_size = height_in * width_in; + int chw_size = ch_size * width_in; + int data_index = 0; + int src_index0 = 0; + int src_index = 0; + for (int n = 0; n < num_in; ++n) { + for (int c = 0; c < channel_in; ++c) { + for (int h = 0; h < height_in; ++h) { + data_index = n * ch_size + c * height_in + h; + src_index0 = n * chw_size + c * hw_size + h * width_in; + dst[data_index] = 0.0; + for (int w = 0; w < width_in; ++w) { + src_index = src_index0 + w; + dst[data_index] += static_cast(src[src_index]) / width_in; + } + } + } + } +} + +void reduce_mean_all(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + float mean = 0.0; + int src_index; + int n_id, c_id; + int all = num_in * channel_in * height_in * 
width_in; + for (int n = 0; n < num_in; ++n) { + n_id = n * channel_in * height_in * width_in; + for (int c = 0; c < channel_in; ++c) { + c_id = c * height_in * width_in; + for (int h = 0; h < height_in; ++h) { + for (int w = 0; w < width_in; ++w) { + src_index = n_id + c_id + h * width_in + w; + mean = src[src_index] / all; + } + } + } + } + dst[0] = mean; +} + +void reduce_mean_nc(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + // reduce n first. + DDimLite ddimA({1, channel_in, height_in, width_in}); + lite::Tensor tensor_tmp; + tensor_tmp.Resize(ddimA); + float* tmp_out = tensor_tmp.mutable_data(); + reduce_mean_n(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_mean_c(tmp_out, dst, 1, channel_in, height_in, width_in); +} + +void reduce_mean_ch(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + // reduce c first + DDimLite ddimA({num_in, 1, height_in, width_in}); + lite::Tensor tensor_tmp; + tensor_tmp.Resize(ddimA); + float* tmp_out = tensor_tmp.mutable_data(); + reduce_mean_c(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_mean_h(tmp_out, dst, num_in, 1, height_in, width_in); +} + +void reduce_mean_hw(const float* src, + float* dst, + int num_in, + int channel_in, + int height_in, + int width_in) { + // reduce h first + DDimLite ddimA({num_in, channel_in, 1, width_in}); + lite::Tensor tensor_tmp; + tensor_tmp.Resize(ddimA); + float* tmp_out = tensor_tmp.mutable_data(); + reduce_mean_h(src, tmp_out, num_in, channel_in, height_in, width_in); + reduce_mean_w(tmp_out, dst, num_in, channel_in, 1, width_in); +} + +void reduce_mean_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + + auto x = scope->FindTensor("x"); + auto x_dims = x->dims(); + auto x_data = x->data(); + auto out = scope->FindMutableTensor("out_ref"); + + auto dim = op_info->GetAttr>("dim"); + auto keep_dim = op_info->GetAttr("keep_dim"); + + auto x_rank = x_dims.size(); + if (!dim.empty()) { + for (size_t i = 0; i < dim.size(); i++) { + if (dim[i] < 0) { + dim[i] += x_rank; + } + } + } + + bool reduce_all = false; + sort(dim.begin(), dim.end()); + if (dim.size() == 0) { + reduce_all = true; + } + + std::vector out_dims; + if (reduce_all) { + if (keep_dim) { + for (size_t i = 0; i < x_dims.size(); i++) { + out_dims.push_back(1); + } + } else { + out_dims.push_back(1); + } + } else { + for (int i = 0; i < x_dims.size(); i++) { + out_dims.push_back(x_dims[i]); + } + if (keep_dim) { + for (size_t i = 0; i < dim.size(); ++i) { + out_dims[dim[i]] = 1L; + } + } else { + int64_t kDelFlag = -2; + for (size_t i = 0; i < dim.size(); ++i) { + out_dims[dim[i]] = kDelFlag; + } + out_dims.erase(remove(out_dims.begin(), out_dims.end(), kDelFlag), + out_dims.end()); + } + out->Resize(DDim(out_dims)); + } + + auto out_data = out->mutable_data(); + int in_n = x_dims[0]; + int in_c = x_dims[1]; + int in_h = x_dims[2]; + int in_w = x_dims[3]; + + if (dim.size() == 0) { + reduce_mean_all(x_data, out_data, in_n, in_c, in_h, in_w); + } else if (dim.size() == 1) { + switch (dim[0]) { + case 0: + reduce_mean_n(x_data, out_data, in_n, in_c, in_h, in_w); + break; + case 1: + reduce_mean_c(x_data, out_data, in_n, in_c, in_h, in_w); + break; + case 2: + reduce_mean_h(x_data, out_data, in_n, in_c, in_h, in_w); + break; + case 3: + reduce_mean_w(x_data, out_data, in_n, in_c, in_h, in_w); + break; + default: + LOG(FATAL) << "error!!!"; + } + } else if (dim.size() == 2) 
{ + if (dim[0] == 0 && dim[1] == 1) { + reduce_mean_nc(x_data, out_data, in_n, in_c, in_h, in_w); + } else if (dim[0] == 1 && dim[1] == 2) { + reduce_mean_ch(x_data, out_data, in_n, in_c, in_h, in_w); + } else if (dim[0] == 2 && dim[1] == 3) { + reduce_mean_hw(x_data, out_data, in_n, in_c, in_h, in_w); + } else { + LOG(FATAL) << "invalid dim!!"; + } + } +} + +void test_reduce_mean(const std::vector& input_shape, + std::vector dim, + bool keep_dim) { + // prepare input&output variables + Scope scope; + std::string x_var_name("x"); + std::string out_var_name("out"); + std::string out_ref_var_name("out_ref"); + auto* x = scope.Var(x_var_name)->GetMutable(); + auto* out = scope.Var(out_var_name)->GetMutable(); + auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); + x->Resize(input_shape); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("reduce_mean"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + opdesc.SetAttr("dim", dim); + opdesc.SetAttr("keep_dim", keep_dim); + + // create and convert op to NPU model, then run it on NPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {x_var_name}, {out_var_name}); + + // execute reference implementation and save to output tensor + reduce_mean_ref(op); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(NPUBridges, reduce_mean) { + std::vector> reduce_dim{ + {0}, {1}, {2}, {3}, {0, 1}, {1, 2}, {2, 3}, {-2, -1}}; + for (auto dim : reduce_dim) { + for (auto keep_dim : {true, false}) { + test_reduce_mean({1, 2, 3, 4}, dim, keep_dim); + } + } +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(reduce_mean); +USE_NPU_BRIDGE(reduce_mean); diff --git a/lite/kernels/npu/bridges/reshape_op.cc b/lite/kernels/npu/bridges/reshape_op.cc index b2ed556faf..a554aac94f 100644 --- a/lite/kernels/npu/bridges/reshape_op.cc +++ b/lite/kernels/npu/bridges/reshape_op.cc @@ -41,8 +41,10 @@ node_map_type ReshapeConverter(const std::shared_ptr reshape_op, reshape_node->set_input_tensor(*inputs_map.at(x_var_name)); lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); - // read shape from actual shape tensor as input "w" if 'Shape' is found - if (lite::npu::HasInputArg(op_info, scope, "Shape")) { + // read shape from "ShapeTensor"(input), or "Shape"(input), or "shape"(attr) + if (lite::npu::HasInputArg(op_info, scope, "ShapeTensor")) { + LOG(FATAL) << "[NPU] not support \"Shape\" from more than one Tensor."; + } else if (lite::npu::HasInputArg(op_info, scope, "Shape")) { auto actual_shape_var_name = op_info->Input("Shape").front(); if (!inputs_map.count(actual_shape_var_name)) { auto actual_shape = diff --git a/lite/kernels/npu/bridges/sqrt_op.cc b/lite/kernels/npu/bridges/sqrt_op.cc new file mode 100644 index 0000000000..84ab3a9eb2 --- /dev/null +++ b/lite/kernels/npu/bridges/sqrt_op.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/npu/builder.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +node_map_type SqrtConverter(const std::shared_ptr sqrt_op, + const node_map_type& inputs_map) { + auto scope = sqrt_op->scope(); + auto op_info = sqrt_op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::npu::UniqueName(op_type); + LOG(INFO) << "[NPU] Converting " + op_type + "..."; + + std::shared_ptr sqrt_node = + std::make_shared(unique_op_type); + + auto x_var_name = op_info->Input("X").front(); + + CHECK(inputs_map.count(x_var_name)); + sqrt_node->set_input_x(*inputs_map.at(x_var_name)); + + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(sqrt_node); + + node_map_type outputs_map; + outputs_map[op_info->Output("Out").front()] = sqrt_node; + return outputs_map; +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_NPU_BRIDGE(sqrt, paddle::lite::kernels::npu::bridges::SqrtConverter); diff --git a/lite/kernels/npu/bridges/sqrt_op_test.cc b/lite/kernels/npu/bridges/sqrt_op_test.cc new file mode 100644 index 0000000000..015d61685b --- /dev/null +++ b/lite/kernels/npu/bridges/sqrt_op_test.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/npu/bridges/test_helper.h" +#include "lite/operators/activation_ops.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +template +void sqrt_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + + auto x = scope->FindTensor("x"); + auto out = scope->FindMutableTensor("out_ref"); + out->Resize(x->dims()); + auto x_data = x->data(); + auto out_data = out->mutable_data(); + + for (size_t i = 0; i < x->numel(); i++) { + out_data[i] = std::sqrtf(x_data[i]); + } +} + +void test_sqrt(const std::vector& input_shape) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.NewTensor(x_var_name); + auto* out = scope.NewTensor(out_var_name); + auto* out_ref = scope.NewTensor(out_ref_var_name); + x->Resize(input_shape); + + // initialize input&output data + FillTensor(x, 0, 5); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("sqrt"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + // create and convert op to NPU model, then run it on NPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {x_var_name}, {out_var_name}); + + // execute reference implementation and save to output tensor + sqrt_ref(op); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(NPUBridges, sqrt) { + test_sqrt({2}); + test_sqrt({2, 3}); + test_sqrt({1, 2, 3, 4}); + test_sqrt({5, 6, 7, 8}); +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(sqrt); +USE_NPU_BRIDGE(sqrt); diff --git a/lite/kernels/npu/bridges/square_op.cc b/lite/kernels/npu/bridges/square_op.cc new file mode 100644 index 0000000000..2ca91adba0 --- /dev/null +++ b/lite/kernels/npu/bridges/square_op.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
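// A side note on sqrt_ref above: std::sqrtf is missing from some standard
// library implementations, so a more portable element-wise reference would
// rely on the float overload of std::sqrt, e.g.:
#include <cmath>
#include <cstddef>

static void SqrtRef(const float* x, float* out, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = std::sqrt(x[i]);  // float overload of std::sqrt
  }
}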
+ +#include "lite/backends/npu/builder.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +node_map_type SquareConverter(const std::shared_ptr square_op, + const node_map_type& inputs_map) { + auto scope = square_op->scope(); + auto op_info = square_op->op_info(); + auto op_type = op_info->Type(); + auto unique_op_type = lite::npu::UniqueName(op_type); + LOG(INFO) << "[NPU] Converting " + op_type + "..."; + + std::shared_ptr square_node = + std::make_shared(unique_op_type); + + auto x_var_name = op_info->Input("X").front(); + + CHECK(inputs_map.count(x_var_name)); + square_node->set_input_x(*inputs_map.at(x_var_name)); + + lite::npu::OpList::Global().add(inputs_map.at(x_var_name)); + lite::npu::OpList::Global().add(square_node); + + node_map_type outputs_map; + outputs_map[op_info->Output("Out").front()] = square_node; + return outputs_map; +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_NPU_BRIDGE(square, + paddle::lite::kernels::npu::bridges::SquareConverter); diff --git a/lite/kernels/npu/bridges/square_op_test.cc b/lite/kernels/npu/bridges/square_op_test.cc new file mode 100644 index 0000000000..d715c11430 --- /dev/null +++ b/lite/kernels/npu/bridges/square_op_test.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "lite/core/op_registry.h" +#include "lite/kernels/npu/bridges/registry.h" +#include "lite/kernels/npu/bridges/test_helper.h" +#include "lite/operators/activation_ops.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace npu { +namespace bridges { + +template +void square_ref(const std::shared_ptr op) { + Scope* scope = op->scope(); + const OpInfo* op_info = op->op_info(); + + auto x = scope->FindTensor("x"); + auto out = scope->FindMutableTensor("out_ref"); + out->Resize(x->dims()); + auto x_data = x->data(); + auto out_data = out->mutable_data(); + + for (size_t i = 0; i < x->numel(); i++) { + out_data[i] = x_data[i] * x_data[i]; + } +} + +void test_square(const std::vector& input_shape) { + // prepare input&output variables + Scope scope; + std::string x_var_name = "x"; + std::string out_var_name = "out"; + std::string out_ref_var_name = "out_ref"; + auto* x = scope.NewTensor(x_var_name); + auto* out = scope.NewTensor(out_var_name); + auto* out_ref = scope.NewTensor(out_ref_var_name); + x->Resize(input_shape); + + // initialize input&output data + FillTensor(x); + + // initialize op desc + cpp::OpDesc opdesc; + opdesc.SetType("square"); + opdesc.SetInput("X", {x_var_name}); + opdesc.SetOutput("Out", {out_var_name}); + + // create and convert op to NPU model, then run it on NPU + auto op = CreateOp(opdesc, &scope); + LauchOp(op, {x_var_name}, {out_var_name}); + + // execute reference implementation and save to output tensor + square_ref(op); + + // compare results + auto* out_data = out->mutable_data(); + auto* out_ref_data = out_ref->mutable_data(); + for (int i = 0; i < out->dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); + } +} + +TEST(NPUBridges, square) { + test_square({2}); + test_square({2, 3}); + test_square({1, 2, 3, 4}); + test_square({5, 6, 7, 8}); +} + +} // namespace bridges +} // namespace npu +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_OP(square); +USE_NPU_BRIDGE(square); diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt index d070eb84c5..99b23c19f0 100644 --- a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ -1,4 +1,4 @@ -if (NOT LITE_WITH_OPENCL) +if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_OPENCL)) return () endif() diff --git a/lite/kernels/opencl/conv_compute.cc b/lite/kernels/opencl/conv_compute.cc index 04a78face2..e13d12ec22 100644 --- a/lite/kernels/opencl/conv_compute.cc +++ b/lite/kernels/opencl/conv_compute.cc @@ -38,15 +38,20 @@ void ConvCompute::PrepareForRun() { int w_out = output_dims[3]; int kernel_h = filter_dims[2]; // oihw int kernel_w = filter_dims[3]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; + auto dilations = *param.dilations; int stride_h = param.strides[0]; int stride_w = param.strides[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int groups = param.groups; bool relu_fused = param.fuse_relu; - bool no_dilation = (param.dilations[0] == 1) && (param.dilations[1] == 1); + bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); bool zero_pad = (pad_h == 0) && (pad_w == 0); + bool pad_equal = + ((paddings[0] == paddings[1]) && (paddings[2] == paddings[3])); + VLOG(3) << "Is relu fused? / " << (relu_fused ? 
"Yes" : "No"); VLOG(3) << "groups:" << groups << " stride_h:" << stride_h << " stride_w:" << stride_w << " pad_h:" << pad_h @@ -60,7 +65,7 @@ void ConvCompute::PrepareForRun() { << filter_dims[2] << " " << filter_dims[3]; if (kernel_h == 1 && kernel_w == 1 && stride_h == 1 && stride_w == 1 && - zero_pad && no_dilation) { + zero_pad && no_dilation && pad_equal) { // conv2d_1x1 kernel_func_names_.push_back("gemm_batch"); kernel_func_paths_.push_back("buffer/fc_kernel.cl"); @@ -70,7 +75,7 @@ void ConvCompute::PrepareForRun() { build_options_.push_back("-DCL_DTYPE=float"); } impl_ = &ConvCompute::Conv2d1x1; - } else { + } else if (pad_equal) { kernel_func_names_.push_back("im2col"); kernel_func_names_.push_back("gemm_batch"); kernel_func_paths_.push_back("buffer/im2col_kernel.cl"); @@ -85,6 +90,9 @@ void ConvCompute::PrepareForRun() { col_buffer_.reset(new lite::Tensor); col_buffer_->Resize({bs, c_in, kernel_h * kernel_w, h_out * w_out}); col_buffer_->mutable_data(TARGET(kOpenCL)); + } else { + LOG(FATAL) << "This pad not support ! " << paddings[0] << ", " + << paddings[1] << ", " << paddings[2] << ", " << paddings[3]; } for (size_t i = 0; i < kernel_func_names_.size(); i++) { @@ -102,17 +110,19 @@ void ConvCompute::GemmlikeConv2d() { int c_in = x_dims[1]; int h_in = x_dims[2]; int w_in = x_dims[3]; + auto paddings = *param.paddings; + auto dilations = *param.dilations; int c_out = output_dims[1]; int h_out = output_dims[2]; int w_out = output_dims[3]; int kernel_h = filter_dims[2]; int kernel_w = filter_dims[3]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int pad_h = paddings[0]; + int pad_w = paddings[2]; int stride_h = param.strides[0]; int stride_w = param.strides[1]; - int dilation_h = param.dilations[0]; - int dilation_w = param.dilations[1]; + int dilation_h = dilations[0]; + int dilation_w = dilations[1]; auto* x_buf = param.x->data(); auto* filter_buf = param.filter->data(); diff --git a/lite/kernels/opencl/conv_compute_test.cc b/lite/kernels/opencl/conv_compute_test.cc index a7417e3525..3bc7a0734d 100644 --- a/lite/kernels/opencl/conv_compute_test.cc +++ b/lite/kernels/opencl/conv_compute_test.cc @@ -24,7 +24,6 @@ namespace lite { #define A(i, j) a[i * lda + j] #define B(i, j) cur_b[i * ldb + j] #define C(i, j) cur_c[i * ldc + j] - template static void conv_basic(const Dtype1* din, Dtype2* dout, @@ -227,10 +226,12 @@ TEST(conv2d, compute_conv2d_1x1) { param.bias = bias_flag ? &bias : nullptr; param.output = &out; param.strides = {stride, stride}; - param.paddings = {pad, pad}; + std::vector paddings = {pad, pad, pad, pad}; param.groups = group; - param.dilations = {dilation, dilation}; + std::vector dilations = {dilation, dilation}; param.fuse_relu = relu_flag; + param.paddings = std::make_shared>(paddings); + param.dilations = std::make_shared>(dilations); kernel->SetParam(param); std::unique_ptr conv_context(new KernelContext); @@ -454,11 +455,14 @@ TEST(conv2d, compute_conv2d_gemm) { param.bias = bias_flag ? 
&bias : nullptr; param.output = &out; param.strides = {stride, stride}; - param.paddings = {pad, pad}; + std::vector paddings = {pad, pad, pad, pad}; param.groups = group; - param.dilations = {dilation, dilation}; + std::vector dilations = {dilation, dilation}; param.fuse_relu = relu_flag; + param.paddings = std::make_shared>(paddings); + param.dilations = std::make_shared>(dilations); + kernel->SetParam(param); std::unique_ptr conv_context(new KernelContext); context->As().CopySharedTo( diff --git a/lite/kernels/opencl/depthwise_conv2d_compute.cc b/lite/kernels/opencl/depthwise_conv2d_compute.cc index 62734610e2..ed942d7f0c 100644 --- a/lite/kernels/opencl/depthwise_conv2d_compute.cc +++ b/lite/kernels/opencl/depthwise_conv2d_compute.cc @@ -44,7 +44,7 @@ class DepthwiseConv2dCompute auto x_dims = param.x->dims(); auto filter_dims = param.filter->dims(); auto output_dims = param.output->dims(); - auto paddings = param.paddings; + auto paddings = *param.paddings; auto strides = param.strides; auto& context = ctx_->As(); diff --git a/lite/kernels/opencl/depthwise_conv2d_compute_test.cc b/lite/kernels/opencl/depthwise_conv2d_compute_test.cc index a189acaf91..3556d1abed 100644 --- a/lite/kernels/opencl/depthwise_conv2d_compute_test.cc +++ b/lite/kernels/opencl/depthwise_conv2d_compute_test.cc @@ -105,7 +105,8 @@ TEST(depthwise_conv2d, compute) { param.x = &input; param.filter = &filter; param.output = &output; - param.paddings = std::vector{0, 0}; + std::vector paddings = {0, 0}; + param.paddings = std::make_shared>(paddings); param.strides = std::vector{1, 1}; std::unique_ptr context(new KernelContext); diff --git a/lite/kernels/opencl/io_copy_compute.cc b/lite/kernels/opencl/io_copy_compute.cc index dc4bdfe64c..3387a0887d 100644 --- a/lite/kernels/opencl/io_copy_compute.cc +++ b/lite/kernels/opencl/io_copy_compute.cc @@ -103,8 +103,9 @@ class IoCopykOpenCLToHostCompute auto* wait_list = context.cl_wait_list(); auto* x_ptr = param.x->data(); - /* TODO(ysh329): io_copy(device->host) jammed if emplace to `cl_wait_list` - in kernel and enable wait_list + /* TODO(ysh329): io_copy(device->host) jammed if `it` emplaced to + `cl_wait_list` + in kernel and `wait_list` enabled auto it = wait_list->find(x_ptr); if (it != wait_list->end()) { VLOG(4) << "--- Find the sync event for the target cl tensor. 
---"; diff --git a/lite/kernels/opencl/pool_compute.cc b/lite/kernels/opencl/pool_compute.cc index dc2e851595..d275b312d6 100644 --- a/lite/kernels/opencl/pool_compute.cc +++ b/lite/kernels/opencl/pool_compute.cc @@ -44,16 +44,22 @@ class PoolCompute const auto& out_dims = param.output->dims(); const std::string pooling_type = param.pooling_type; const bool global_pooling = param.global_pooling; - std::vector paddings = param.paddings; + std::vector paddings = *param.paddings; std::vector strides = param.strides; std::vector ksize = param.ksize; if (global_pooling) { for (size_t i = 0; i < ksize.size(); ++i) { - paddings[i] = 0; + paddings[2 * i] = 0; + paddings[2 * i + 1] = 0; ksize[i] = static_cast(in_dims[i + 2]); } } - + bool pads_equal = + (paddings[0] == paddings[1]) && (paddings[2] == paddings[3]); + if (!pads_equal) { + LOG(FATAL) + << "padding requires pad_left == pad_right, pad_top == pad_bottom"; + } auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); auto* input_buf = param.x->data(); @@ -89,7 +95,7 @@ class PoolCompute CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, static_cast(paddings[0])); CL_CHECK_FATAL(status); - status = kernel.setArg(++arg_idx, static_cast(paddings[1])); + status = kernel.setArg(++arg_idx, static_cast(paddings[2])); CL_CHECK_FATAL(status); status = kernel.setArg(++arg_idx, *output_buf); CL_CHECK_FATAL(status); diff --git a/lite/kernels/opencl/pool_compute_test.cc b/lite/kernels/opencl/pool_compute_test.cc index 53f64e9505..25f0e72634 100644 --- a/lite/kernels/opencl/pool_compute_test.cc +++ b/lite/kernels/opencl/pool_compute_test.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include "lite/backends/opencl/target_wrapper.h" #include "lite/core/op_registry.h" @@ -88,9 +89,10 @@ TEST(pool2d, compute) { param.output = &out; param.global_pooling = true; param.pooling_type = "avg"; - param.paddings = std::vector{0, 0}; + std::vector paddings = {0, 0, 0, 0}; param.strides = std::vector{1, 1}; param.ksize = std::vector{7, 7}; + param.paddings = std::make_shared>(paddings); std::unique_ptr context(new KernelContext); context->As().InitOnce(); diff --git a/lite/kernels/x86/CMakeLists.txt b/lite/kernels/x86/CMakeLists.txt index da955e4fd5..bf3a1685f0 100644 --- a/lite/kernels/x86/CMakeLists.txt +++ b/lite/kernels/x86/CMakeLists.txt @@ -5,6 +5,7 @@ add_kernel(activation_compute_x86 X86 basic SRCS activation_compute.cc DEPS ${li # lite_cc_library(fc_compute_x86 SRCS fc_compute.cc DEPS ${lite_kernel_deps}) add_kernel(scale_compute_x86 X86 basic SRCS scale_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(cast_compute_x86 X86 basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} fluid_data_type) add_kernel(slice_compute_x86 X86 basic SRCS slice_compute.cc DEPS ${lite_kernel_deps}) add_kernel(squeeze_compute_x86 X86 basic SRCS squeeze_compute.cc DEPS ${lite_kernel_deps}) add_kernel(fill_constant_batch_size_like_compute_x86 X86 basic SRCS fill_constant_batch_size_like_compute.cc DEPS ${lite_kernel_deps} math_function) @@ -15,8 +16,10 @@ add_kernel(conv_compute_x86 X86 basic SRCS conv_compute.cc DEPS ${lite_kernel_de # lite_cc_library(dropout_compute_x86 SRCS dropout_compute.cc DEPS ${lite_kernel_deps} ) # lite_cc_library(conv_compute_x86 SRCS conv_compute.cc DEPS ${lite_kernel_deps} blas im2col vol2col) add_kernel(pool_compute_x86 X86 basic SRCS pool_compute.cc DEPS ${lite_kernel_deps} pooling) +add_kernel(stack_compute_x86 X86 basic SRCS stack_compute.cc DEPS ${lite_kernel_deps}) add_kernel(dropout_compute_x86 
X86 basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps}) add_kernel(transpose_compute_x86 X86 basic SRCS transpose_compute.cc DEPS ${lite_kernel_deps} math_function) +add_kernel(layer_norm_compute_x86 X86 basic SRCS layer_norm_compute.cc DEPS ${lite_kernel_deps} jit_kernel_helper) # add_kernel(fc_compute_x86 X86 basic SRCS fc_compute.cc DEPS ${lite_kernel_deps}) # lite_cc_library(batch_norm_compute_x86 SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps}) # lite_cc_library(uniform_random_compute_x86 SRCS uniform_random_compute.cc DEPS ${lite_kernel_deps} ) @@ -26,6 +29,7 @@ add_kernel(sequence_expand_as_compute_x86 X86 basic SRCS sequence_expand_as_comp # lite_cc_test(test_fc_compute_x86 SRCS fc_compute_test.cc DEPS fc_compute_x86) # lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86) +add_kernel(gather_compute_x86 X86 basic SRCS gather_compute.cc DEPS ${lite_kernel_deps} fluid_data_type) # lite_cc_test(test_scale_compute_x86 SRCS scale_compute_test.cc DEPS scale_compute_x86) # lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86) # lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_x86) @@ -33,12 +37,27 @@ add_kernel(mul_compute_x86 X86 basic SRCS mul_compute.cc DEPS ${lite_kernel_deps add_kernel(concat_compute_x86 X86 basic SRCS concat_compute.cc DEPS ${lite_kernel_deps}) add_kernel(shape_compute_x86 X86 basic SRCS shape_compute.cc DEPS ${lite_kernel_deps}) add_kernel(sequence_pool_compute_x86 X86 basic SRCS sequence_pool_compute.cc DEPS ${lite_kernel_deps} sequence_pooling) +add_kernel(search_group_padding_compute_x86 X86 basic SRCS search_group_padding_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(sequence_reverse_compute_x86 X86 basic SRCS sequence_reverse_compute.cc DEPS ${lite_kernel_deps}) add_kernel(softmax_compute_x86 X86 basic SRCS softmax_compute.cc DEPS ${lite_kernel_deps} softmax) add_kernel(elementwise_compute_x86 X86 basic SRCS elementwise_compute.cc DEPS ${lite_kernel_deps}) add_kernel(batch_norm_compute_x86 X86 basic SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps}) add_kernel(reduce_sum_compute_x86 X86 basic SRCS reduce_compute.cc DEPS ${lite_kernel_deps}) add_kernel(lookup_table_compute_x86 X86 basic SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps}) add_kernel(sequence_reshape_compute_x86 X86 basic SRCS sequence_reshape_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(match_matrix_tensor_compute_x86 X86 basic SRCS match_matrix_tensor_compute.cc DEPS ${lite_kernel_deps} blas math_function) +add_kernel(search_seq_depadding_compute_x86 X86 basic SRCS search_seq_depadding_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(search_grnn_compute_x86 X86 basic SRCS search_grnn_compute.cc DEPS ${lite_kernel_deps} blas math_function) +add_kernel(sequence_concat_compute_x86 X86 basic SRCS sequence_concat_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(var_conv_2d_compute_x86 X86 basic SRCS var_conv_2d_compute.cc DEPS ${lite_kernel_deps} blas fluid_data_type) +add_kernel(attention_padding_mask_compute_x86 X86 basic SRCS attention_padding_mask_compute.cc DEPS ${lite_kernel_deps}) +add_kernel(sequence_arithmetic_compute_x86 X86 basic SRCS sequence_arithmetic_compute.cc DEPS ${lite_kernel_deps}) + +# for content-dnn specific +add_kernel(search_aligned_mat_mul_compute_x86 X86 extra SRCS search_aligned_mat_mul_compute.cc DEPS ${lite_kernel_deps} blas) +add_kernel(search_seq_fc_compute_x86 X86 extra SRCS search_seq_fc_compute.cc DEPS ${lite_kernel_deps} blas) 
+add_kernel(sequence_topk_avg_pooling_compute_x86 X86 basic SRCS sequence_topk_avg_pooling_compute.cc DEPS ${lite_kernel_deps} sequence_topk_avg_pooling) +add_kernel(search_fc_compute_x86 X86 basic SRCS search_fc_compute.cc DEPS ${lite_kernel_deps} search_fc) if(NOT LITE_WITH_X86) return() @@ -47,12 +66,14 @@ add_kernel(matmul_compute_x86 X86 basic SRCS matmul_compute.cc DEPS ${lite_kerne lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86) lite_cc_test(test_mul_compute_x86 SRCS mul_compute_test.cc DEPS mul_compute_x86) +lite_cc_test(test_gather_compute_x86 SRCS gather_compute_test.cc DEPS gather_compute_x86) lite_cc_test(test_slice_compute_x86 SRCS slice_compute_test.cc DEPS slice_compute_x86) lite_cc_test(test_squeeze_compute_x86 SRCS squeeze_compute_test.cc DEPS squeeze_compute_x86) lite_cc_test(test_fill_constant_batch_size_like_compute_x86 SRCS fill_constant_batch_size_like_compute_test.cc DEPS fill_constant_batch_size_like_compute_x86) lite_cc_test(test_reshape_compute_x86 SRCS reshape_compute_test.cc DEPS reshape_compute_x86) lite_cc_test(test_concat_compute_x86 SRCS concat_compute_test.cc DEPS concat_compute_x86) lite_cc_test(test_sequence_pool_compute_x86 SRCS sequence_pool_compute_test.cc DEPS sequence_pool_compute_x86) +lite_cc_test(test_sequence_reverse_compute_x86 SRCS sequence_reverse_compute_test.cc DEPS sequence_reverse_compute_x86) lite_cc_test(test_shape_compute_x86 SRCS shape_compute_test.cc DEPS shape_compute_x86) lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_x86) lite_cc_test(test_softmax_compute_x86 SRCS softmax_compute_test.cc DEPS softmax_compute_x86) @@ -63,7 +84,19 @@ lite_cc_test(test_gelu_compute_x86 SRCS gelu_compute_test.cc DEPS activation_com lite_cc_test(test_sequence_expand_as_compute_x86 SRCS sequence_expand_as_compute_test.cc DEPS sequence_expand_as_compute_x86) lite_cc_test(test_gru_compute_x86 SRCS gru_compute_test.cc DEPS gru_compute_x86) lite_cc_test(test_matmul_compute_x86 SRCS matmul_compute_test.cc DEPS matmul_compute_x86) - +lite_cc_test(test_cast_compute_x86 SRCS cast_compute_test.cc DEPS cast_compute_x86) lite_cc_test(test_pool2d_compute_x86 SRCS pool_compute_test.cc DEPS pool_compute_x86) +lite_cc_test(test_layer_norm_compute_x86 SRCS layer_norm_compute_test.cc DEPS layer_norm_compute_x86) lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86) lite_cc_test(test_transpose_compute_x86 SRCS transpose_compute_test.cc DEPS transpose_compute_x86) +lite_cc_test(test_search_fc_compute_x86 SRCS search_fc_compute_test.cc DEPS search_fc_compute_x86) +lite_cc_test(test_search_seq_depadding_compute_x86 SRCS search_seq_depadding_compute_test.cc DEPS search_seq_depadding_compute_x86) +lite_cc_test(test_search_grnn_compute_x86 SRCS search_grnn_compute_test.cc DEPS search_grnn_compute_x86) +lite_cc_test(test_match_matrix_compute_x86 SRCS match_matrix_tensor_compute_test.cc DEPS match_matrix_tensor_compute_x86) +lite_cc_test(test_lookup_table_compute_x86 SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_x86) +lite_cc_test(test_stack_compute_x86 SRCS stack_compute_test.cc DEPS stack_compute_x86) +lite_cc_test(test_search_group_padding_compute_x86 SRCS search_group_padding_compute_test.cc DEPS search_group_padding_compute_x86) +lite_cc_test(test_sequence_concat_compute_x86 SRCS sequence_concat_compute_test.cc DEPS sequence_concat_compute_x86) +lite_cc_test(test_var_conv_2d_compute_x86 SRCS var_conv_2d_compute_test.cc DEPS 
var_conv_2d_compute_x86) +#lite_cc_test(test_attention_padding_mask_compute_x86 SRCS attention_padding_mask_compute_test.cc DEPS attention_padding_mask_compute_x86) +lite_cc_test(test_sequence_arithmetic_compute_x86 SRCS sequence_arithmetic_compute_test.cc DEPS sequence_arithmetic_compute_x86) diff --git a/lite/kernels/x86/attention_padding_mask_compute.cc b/lite/kernels/x86/attention_padding_mask_compute.cc new file mode 100644 index 0000000000..0c35c416e7 --- /dev/null +++ b/lite/kernels/x86/attention_padding_mask_compute.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/attention_padding_mask_compute.h" + +REGISTER_LITE_KERNEL( + search_attention_padding_mask, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::AttentionPaddingMaskCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("pad_begin", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/attention_padding_mask_compute.h b/lite/kernels/x86/attention_padding_mask_compute.h new file mode 100644 index 0000000000..b9124e5ad4 --- /dev/null +++ b/lite/kernels/x86/attention_padding_mask_compute.h @@ -0,0 +1,83 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +#include "lite/fluid/eigen.h" +#include "lite/operators/attention_padding_mask_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class AttentionPaddingMaskCompute + : public KernelLite { + public: + using param_t = operators::AttentionPaddingMaskParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto* bottom0 = param.X; + auto* bottom1 = param.Y; + auto* _pad_begin = param.pad_begin; + auto* top = param.Out; + int _pad_id = param.pad_id; + float _mask = param.mask; + auto src_len = static_cast(bottom1->lod()[0][1]); + const int att_batch = bottom0->lod()[0].size() - 1; + const int src_batch = bottom1->lod()[0].size() - 1; + int* pad_begin = _pad_begin->mutable_data(); + for (int i = 0; i < src_batch; ++i) { + const auto* src_data = bottom1->data() + src_len * i; + int index = src_len - 1; + for (; index >= 0 && _pad_id == static_cast(src_data[index]); + --index) { + } + pad_begin[i] = index + 1; + } + + const auto att_len = static_cast(bottom0->lod()[0][1]); + auto* top_data = top->mutable_data(); + memcpy(top_data, + bottom0->data(), + bottom0->dims()[0] * bottom0->dims()[1] * sizeof(T)); + for (int i = 0; i < att_batch; ++i) { + for (int j = 0; j < att_len; ++j) { + top_data = top->mutable_data() + src_len * (att_len * i + j); + int src_idx = i % src_batch; + for (int k = pad_begin[src_idx]; k < src_len; ++k) { + top_data[k] = _mask; + } + } + } + } + + virtual ~AttentionPaddingMaskCompute() = default; + + private: + lite::Tensor src_offset_; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/attention_padding_mask_compute_test.cc b/lite/kernels/x86/attention_padding_mask_compute_test.cc new file mode 100644 index 0000000000..35ce822e01 --- /dev/null +++ b/lite/kernels/x86/attention_padding_mask_compute_test.cc @@ -0,0 +1,132 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
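// A minimal sketch, for illustration only, of the pad_begin logic in
// AttentionPaddingMaskCompute above: each source row is scanned from the back
// while the value equals pad_id, and every position from pad_begin to src_len
// in the matching attention rows is then overwritten with the mask value.
static int ComputePadBegin(const float* src_row, int src_len, int pad_id) {
  int index = src_len - 1;
  while (index >= 0 && static_cast<int>(src_row[index]) == pad_id) {
    --index;
  }
  return index + 1;  // first position that belongs to padding
}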
+ +#include "lite/kernels/x86/attention_padding_mask_compute.cc" +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +void attention_padding_mask_ref( + const Tensor& x, + const Tensor& y, + Tensor* out, + Tensor* pad_begin, + const operators::AttentionPaddingMaskParam& param) { + auto attn_offset = x.lod()[0]; + auto src_offset = y.lod()[0]; + int attn_seq_num = attn_offset.size() - 1; + int src_seq_num = src_offset.size() - 1; + int attn_seq_len = attn_offset[1]; + int src_seq_len = x.dims()[1]; + CHECK_EQ(attn_seq_num % src_seq_num, 0); + + auto count = x.numel(); + auto attn_data = x.data(); + out->Resize(x.dims()); + out->set_lod(x.lod()); + auto out_data = out->mutable_data(); + memcpy(out_data, attn_data, count * sizeof(float)); + + for (int i = 0; i < attn_seq_num; ++i) { + for (int j = 0; j < attn_seq_len; ++j) { + auto tmp_out_data = out_data + src_seq_len * (attn_seq_len * i + j); + int src_seq_idx = i % src_seq_num; + int cur_len = src_offset[src_seq_idx + 1] - src_offset[src_seq_idx]; + for (int k = cur_len; k < src_seq_len; k++) { + tmp_out_data[k] = param.mask; + } + } + } +} + +void prepare_input(Tensor* x, const LoD& lod, int64_t dim2rd) { + std::vector x_dims{static_cast(lod[0].back()), dim2rd}; + x->Resize(x_dims); + x->set_lod(lod); + auto x_data = x->mutable_data(); + auto x_num = x->numel(); + for (int i = 0; i < x_num; i++) { + x_data[i] = (i - x_num) * 1.1; + } +} + +int get_max_len(const LoD& lod) { + int max_len = 0; + auto offset = lod[0]; + for (int i = 0; i < offset.size() - 1; i++) { + int cur_len = offset[i + 1] - offset[i]; + max_len = max_len < cur_len ? cur_len : max_len; + } + return max_len; +} + +TEST(attention_padding_mask_x86, retrive_op) { + auto attention_padding_mask = + KernelRegistry::Global().Create( + "attention_padding_mask"); + ASSERT_FALSE(attention_padding_mask.empty()); + ASSERT_TRUE(attention_padding_mask.front()); +} + +TEST(attention_padding_mask_x86, init) { + AttentionPaddingMaskCompute attention_padding_mask; + ASSERT_EQ(attention_padding_mask.precision(), PRECISION(kFloat)); + ASSERT_EQ(attention_padding_mask.target(), TARGET(kX86)); +} + +TEST(attention_padding_mask_x86, run_test) { + lite::Tensor x, y; + lite::Tensor out, pad_begin, out_ref, pad_begin_ref; + + LoD x_lod{{0, 3, 6, 9, 12}}, y_lod{{0, 4, 6}}; + prepare_input(&x, x_lod, get_max_len(y_lod)); + prepare_input(&y, y_lod, 1); + + operators::AttentionPaddingMaskParam param; + param.X = &x; + param.Y = &y; + param.pad_id = 12800001; + param.mask = -90000000.f; + param.Out = &out; + param.pad_begin = &pad_begin; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + AttentionPaddingMaskCompute attention_padding_mask_kernel; + attention_padding_mask_kernel.SetParam(param); + attention_padding_mask_kernel.SetContext(std::move(ctx)); + attention_padding_mask_kernel.Run(); + + attention_padding_mask_ref(x, y, &out_ref, &pad_begin_ref, param); + auto out_data = out.data(); + auto out_ref_data = out_ref.data(); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_attention_padding_mask, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/cast_compute.cc b/lite/kernels/x86/cast_compute.cc new file mode 100644 index 0000000000..d342056c7f --- /dev/null +++ b/lite/kernels/x86/cast_compute.cc @@ -0,0 
+1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/cast_compute.h" + +REGISTER_LITE_KERNEL(cast, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::CastCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/cast_compute.h b/lite/kernels/x86/cast_compute.h new file mode 100644 index 0000000000..06e47e9a50 --- /dev/null +++ b/lite/kernels/x86/cast_compute.h @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +#include "lite/fluid/data_type.h" +#include "lite/fluid/hostdevice.h" +#include "lite/fluid/transform.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +struct CastOpTransformFunctor { + HOSTDEVICE OutT operator()(InT in) const { return static_cast(in); } +}; + +template +class CastOpFunctor { + public: + CastOpFunctor(const lite::Tensor* in, + lite::Tensor* out, + const lite::Context& context) + : input(in), output(out), ctx(context) {} + + template + void apply() const { + auto* in_begin = input->data(); + auto numel = input->dims().production(); + auto* in_end = in_begin + numel; + auto* out_begin = output->mutable_data(); + paddle::lite::fluid::Transform trans; + trans( + ctx, in_begin, in_end, out_begin, CastOpTransformFunctor()); + } + + private: + const lite::Tensor* input; + lite::Tensor* output; + const lite::Context& ctx; +}; + +template +class CastCompute : public KernelLite { + public: + using param_t = operators::CastParam; + + void Run() override { + auto param = param_.get_mutable(); + auto& context = ctx_->As(); + auto x = param->X; + auto out = param->Out; + auto out_dtype = param->out_dtype; + paddle::lite::fluid::VisitDataType( + static_cast(out_dtype), + CastOpFunctor(x, out, context)); + } + virtual ~CastCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/cast_compute_test.cc b/lite/kernels/x86/cast_compute_test.cc new file mode 100644 index 0000000000..f7aa52ca6d --- /dev/null +++ b/lite/kernels/x86/cast_compute_test.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/cast_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(cast_x86, retrive_op) { + auto cast = + KernelRegistry::Global().Create("cast"); + ASSERT_FALSE(cast.empty()); + ASSERT_TRUE(cast.front()); +} + +TEST(cast_x86, init) { + CastCompute cast; + ASSERT_EQ(cast.precision(), PRECISION(kFloat)); + ASSERT_EQ(cast.target(), TARGET(kX86)); +} + +TEST(cast_x86, run_test) { + lite::Tensor x, out; + constexpr int batch_size = 1; + std::vector x_shape{batch_size, 1, 3, 3}; + x.Resize(lite::DDim(x_shape)); + + std::vector out_shape{batch_size, 1, 3, 3}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(1); + } + + CastCompute cast; + operators::CastParam param; + param.X = &x; + param.Out = &out; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + cast.SetContext(std::move(ctx)); + cast.SetParam(param); + cast.Run(); + + std::vector ref_results = {1, 1, 1, 1, 1, 1, 1, 1, 1}; + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_results[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(cast, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/conv_compute.h b/lite/kernels/x86/conv_compute.h index 48cb3c74ef..e9f403059f 100644 --- a/lite/kernels/x86/conv_compute.h +++ b/lite/kernels/x86/conv_compute.h @@ -67,7 +67,7 @@ class Conv2dCompute : public KernelLite { lite::DDim col_shape(col_shape_vec); lite::DDim col_matrix_shape = col_shape.Flatten2D(data_dim + 1); bool is_expand = IsExpand( - filter_shape_vec, param.strides, param.paddings, param.dilations); + filter_shape_vec, param.strides, *param.paddings, *param.dilations); lite::Tensor col; lite::Tensor col_matrix; if (is_expand) { @@ -95,20 +95,15 @@ class Conv2dCompute : public KernelLite { auto blas = paddle::lite::x86::math::GetBlas(context); for (int i = 0; i < batch_size; i++) { - lite::Tensor in_batch; - lite::Tensor tmp_in_batch = param.x->Slice(i, i + 1); - tmp_in_batch.Resize(input_shape); - in_batch.ShareDataWith(tmp_in_batch); - lite::Tensor out_batch; - lite::Tensor tmp_out_batch = param.output->Slice(i, i + 1); - tmp_out_batch.Resize(output_matrix_shape); - out_batch.ShareDataWith(tmp_out_batch); + lite::Tensor in_batch = param.x->Slice(i, i + 1); + in_batch.Resize(input_shape); + lite::Tensor out_batch = param.output->Slice(i, i + 1); + out_batch.Resize(output_matrix_shape); for (int g = 0; g < param.groups; g++) { - lite::Tensor in_slice; - in_slice.ShareDataWith( + lite::Tensor in_slice = in_batch.Slice(static_cast(g * in_step), - static_cast((g + 1) * in_step))); - + static_cast((g + 1) * in_step)); + auto paddings = *param.paddings; if (!is_expand) 
{ col.ShareDataWith(in_slice); col_matrix.ShareDataWith(col); @@ -117,32 +112,30 @@ class Conv2dCompute : public KernelLite { // im2col im2col(context, in_slice, - param.dilations, + *param.dilations, param.strides, - std::vector{param.paddings[0], - param.paddings[1], - param.paddings[0], - param.paddings[1]}, + std::vector{ + paddings[0], paddings[2], paddings[0], paddings[2]}, &(col)); } else if (data_dim == 3U) { // vol2col vol2col(context, in_slice, - param.dilations, + *param.dilations, param.strides, - param.paddings, + *param.paddings, &(col)); } // gemm lite::Tensor out_slice; - out_slice.ShareDataWith( + out_slice = out_batch.Slice(static_cast(g * out_step), - static_cast((g + 1) * out_step))); + static_cast((g + 1) * out_step)); lite::Tensor filter_slice; - filter_slice.ShareDataWith( + filter_slice = filter.Slice(static_cast(g * out_step), - static_cast((g + 1) * out_step))); + static_cast((g + 1) * out_step)); blas.MatMul(filter_slice, false, col_matrix, diff --git a/lite/kernels/x86/conv_compute_test.cc b/lite/kernels/x86/conv_compute_test.cc index f2dde962b9..2827c6577e 100644 --- a/lite/kernels/x86/conv_compute_test.cc +++ b/lite/kernels/x86/conv_compute_test.cc @@ -73,9 +73,11 @@ TEST(conv2d_x86, run_test) { param.bias = &b; param.output = &out; param.strides = {1, 1}; - param.paddings = {0, 0}; + std::vector paddings = {0, 0, 0, 0}; param.groups = 1; - param.dilations = {1, 1}; + std::vector dilations = {1, 1}; + param.paddings = std::make_shared>(paddings); + param.dilations = std::make_shared>(dilations); LOG(INFO) << 123; std::unique_ptr ctx(new KernelContext); ctx->As(); diff --git a/lite/kernels/x86/fill_constant_compute.cc b/lite/kernels/x86/fill_constant_compute.cc index 1eb76332cc..dace1e9025 100644 --- a/lite/kernels/x86/fill_constant_compute.cc +++ b/lite/kernels/x86/fill_constant_compute.cc @@ -29,6 +29,38 @@ class FillConstantCompute : public KernelLite { public: using param_t = operators::FillConstantParam; + inline DDimLite GetShape(const param_t& param) { + // 1. shape is a Tensor + if (param.shape_tensor != nullptr) { + auto* shape_tensor = param.shape_tensor; + auto* shape_data = shape_tensor->data(); + auto vec_shape = + std::vector(shape_data, shape_data + shape_tensor->numel()); + return DDimLite(vec_shape); + } + + // 2. shape is a list/tuple containing Tensor + auto shape_tensor_list = param.shape_tensor_list; + if (shape_tensor_list.size() > 0) { + std::vector vec_shape; + for (size_t i = 0; i < shape_tensor_list.size(); ++i) { + auto tensor = shape_tensor_list[i]; + vec_shape.push_back(*tensor->data()); + } + return DDimLite(vec_shape); + } + + // 3. 
shape is a list/tuple without containing Tensor + auto vec_shape = param.shape; + return DDimLite(vec_shape); + } + + void PrepareForRun() override { + auto& param = *param_.get_mutable(); + auto outdims = GetShape(param); + param.Out->Resize(outdims); + } + void Run() override { auto& param = *param_.get_mutable(); auto& context = ctx_->As(); @@ -55,5 +87,9 @@ REGISTER_LITE_KERNEL(fill_constant, kNCHW, paddle::lite::kernels::x86::FillConstantCompute, def) + .BindInput("ShapeTensor", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) + .BindInput("ShapeTensorList", + {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); diff --git a/lite/kernels/x86/gather_compute.cc b/lite/kernels/x86/gather_compute.cc new file mode 100644 index 0000000000..836f336271 --- /dev/null +++ b/lite/kernels/x86/gather_compute.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/gather_compute.h" + +typedef paddle::lite::kernels::x86::GatherCompute GatherInt32; +typedef paddle::lite::kernels::x86::GatherCompute GatherInt64; + +REGISTER_LITE_KERNEL(gather, kX86, kFloat, kNCHW, GatherInt32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); + +REGISTER_LITE_KERNEL(gather, kX86, kFloat, kNCHW, GatherInt64, int64_in) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Index", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/gather_compute.h b/lite/kernels/x86/gather_compute.h new file mode 100644 index 0000000000..6ee270647f --- /dev/null +++ b/lite/kernels/x86/gather_compute.h @@ -0,0 +1,99 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
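// Note: the two REGISTER_LITE_KERNEL(gather, ...) entries above ("def" and
// "int64_in") bind one float gather computation to int32 and int64 index
// tensors respectively. A standalone sketch of that pattern, using
// hypothetical helper names for illustration only:
#include <cstdint>
#include <vector>

template <typename IndexT>
std::vector<float> GatherRows(const std::vector<float>& src, int64_t row_width,
                              const std::vector<IndexT>& index) {
  std::vector<float> out(index.size() * row_width);
  for (size_t i = 0; i < index.size(); ++i) {
    const int64_t row = static_cast<int64_t>(index[i]);
    for (int64_t j = 0; j < row_width; ++j) {
      out[i * row_width + j] = src[row * row_width + j];
    }
  }
  return out;
}
// The same body serves both registrations:
//   GatherRows<int32_t>(...)  -> the "def" kernel (kInt32 Index)
//   GatherRows<int64_t>(...)  -> the "int64_in" kernel (kInt64 Index)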
+ +#pragma once + +#include +#include "lite/api/paddle_place.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" +#include "lite/fluid/data_type.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +/** + * A thin wrapper for gathering on cpu tensor + * Return a new tensor from source tensor, gathered according to index + * input[src]: type-T source Tensor + * input[index]: type-IndexT index Tensor (1-D) + * return: output tensor + */ +template +void CPUGather(const lite::Tensor* src, + const lite::Tensor* index, + lite::Tensor* output) { + // check index of shape 1-D + if (index->dims().size() == 2) { + CHECK(index->dims()[1] == 1) << "Index(Input)'s dimension[1] should be 1 " + "when Index(input)'s dimension's size " + "equal to 2 in Gather(Op)."; + } else { + CHECK(index->dims().size() == 1) + << "Index(Input)'s dimension's size() should be 1 or 2 in Gather(Op)."; + } + int64_t index_size = index->dims()[0]; + + auto src_dims = src->dims(); + + const T* p_src = src->data(); + const IndexT* p_index = index->data(); + T* p_output = output->mutable_data(); + + // slice size + int slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + const size_t slice_bytes = slice_size * sizeof(T); + for (int64_t i = 0; i < index_size; ++i) { + int index_ = p_index[i]; + memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes); + } +} + +template +class GatherCompute : public KernelLite { + public: + using param_t = operators::GatherParam; + + void Run() override { + auto& param = *param_.get_mutable(); + + auto x = param.X; + auto index = param.Index; + auto out = param.Out; + + out->mutable_data(); + if (x->dims().production() == 0) return; + /* + * Since there's no type defined for lite::Tensor in Paddle-Lite, then + * convert the Index's value to float which must be int32_t or int64_t and + * this supposes to cause no precision difference during inference just for + * now. + * Alternatively, if define the Tensor's type during registering, may cause + * a redefinition error. + */ + CPUGather(x, index, out); + } + + virtual ~GatherCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/gather_compute_test.cc b/lite/kernels/x86/gather_compute_test.cc new file mode 100644 index 0000000000..286dfcb08a --- /dev/null +++ b/lite/kernels/x86/gather_compute_test.cc @@ -0,0 +1,159 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
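// Note: a worked example of the CPUGather copy above (illustration only).
// For src dims {10, 20}, slice_size is 20 floats (80 bytes per row); with
// Index = {1, 3, 5} the loop issues three memcpy calls that copy source
// rows 1, 3 and 5 into output rows 0, 1 and 2, giving an output of dims
// {3, 20}. This is exactly the 2-D case exercised in the test below.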
+ +#include "lite/kernels/x86/gather_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(gather_x86, retrive_op) { + auto gather = + KernelRegistry::Global().Create( + "gather"); + ASSERT_FALSE(gather.empty()); + int cnt = 0; + for (auto item = gather.begin(); item != gather.end(); ++item) { + cnt++; + ASSERT_TRUE(*item); + } + ASSERT_EQ(cnt, 2); +} + +TEST(gather_x86, int32_init) { + GatherCompute gather; + ASSERT_EQ(gather.precision(), PRECISION(kFloat)); + ASSERT_EQ(gather.target(), TARGET(kX86)); +} + +TEST(gather_x86, int64_init) { + GatherCompute gather; + ASSERT_EQ(gather.precision(), PRECISION(kFloat)); + ASSERT_EQ(gather.target(), TARGET(kX86)); +} + +template +void test_case_1dims() { + lite::Tensor x, index, out; + std::vector x_shape{10}; + x.Resize(lite::DDim(x_shape)); + std::vector index_shape{3}; + index.Resize(lite::DDim(index_shape)); + std::vector out_shape{3}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto index_data = index.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); ++i) { + x_data[i] = static_cast(i); + } + std::vector index_value{1, 3, 5}; + for (int i = 0; i < index.dims().production(); ++i) { + index_data[i] = static_cast(index_value[i]); + } + + GatherCompute gather; + operators::GatherParam param; + + param.X = &x; + param.Index = &index; + param.Out = &out; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + gather.SetContext(std::move(ctx)); + gather.SetParam(param); + gather.Run(); + + std::vector ref_data{1, 3, 5}; + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_data[i], 1e-5); + } +} + +template +void test_case_2dims() { + lite::Tensor x, index, out; + std::vector x_shape{10, 20}; + x.Resize(lite::DDim(x_shape)); + std::vector index_shape{3}; + index.Resize(lite::DDim(index_shape)); + std::vector out_shape{3, 20}; + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto index_data = index.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); ++i) { + x_data[i] = static_cast(i); + } + std::vector index_value{1, 3, 5}; + for (int i = 0; i < index.dims().production(); ++i) { + index_data[i] = static_cast(index_value[i]); + } + + GatherCompute gather; + operators::GatherParam param; + + param.X = &x; + param.Index = &index; + param.Out = &out; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + gather.SetContext(std::move(ctx)); + gather.SetParam(param); + gather.Run(); + + std::vector ref_data(60); + for (int i = 0; i < 20; ++i) { + ref_data[i] = static_cast(20 + i); + } + for (int i = 20; i < 40; ++i) { + ref_data[i] = static_cast(40 + i); + } + for (int i = 40; i < 60; ++i) { + ref_data[i] = static_cast(60 + i); + } + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_data[i], 1e-5); + } +} + +TEST(gather_x86, run_test_1dims) { + test_case_1dims(); + test_case_1dims(); +} + +TEST(gather_x86, run_test_2dims) { + test_case_2dims(); + test_case_2dims(); +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(gather, kX86, kFloat, kNCHW, def); +USE_LITE_KERNEL(gather, kX86, kFloat, kNCHW, int64_in); diff --git a/lite/kernels/x86/layer_norm_compute.cc b/lite/kernels/x86/layer_norm_compute.cc new file mode 100644 index 0000000000..4854a69a1d 
--- /dev/null +++ b/lite/kernels/x86/layer_norm_compute.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/layer_norm_compute.h" + +REGISTER_LITE_KERNEL(layer_norm, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::LayerNormCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Mean", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Variance", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/layer_norm_compute.h b/lite/kernels/x86/layer_norm_compute.h new file mode 100644 index 0000000000..bbbdb91deb --- /dev/null +++ b/lite/kernels/x86/layer_norm_compute.h @@ -0,0 +1,91 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
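// Note: a naive standalone reference (illustration only, not the jitted
// LayerNorm kernel used below) of the computation this kernel performs
// after Flatten2D: the input is viewed as a [left, right] matrix, each row
// is normalized by its own mean and variance, then scaled and shifted per
// column by Scale and Bias.
#include <cmath>

inline void LayerNormRef(const float* x, const float* scale, const float* bias,
                         int left, int right, float epsilon, float* y,
                         float* mean, float* var) {
  for (int i = 0; i < left; ++i) {
    const float* row = x + i * right;
    float m = 0.f, v = 0.f;
    for (int j = 0; j < right; ++j) m += row[j];
    m /= right;
    for (int j = 0; j < right; ++j) v += (row[j] - m) * (row[j] - m);
    v /= right;
    mean[i] = m;
    var[i] = v;
    const float inv_std = 1.f / std::sqrt(v + epsilon);
    for (int j = 0; j < right; ++j) {
      y[i * right + j] = (row[j] - m) * inv_std * scale[j] + bias[j];
    }
  }
}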
+ +#pragma once + +#include "lite/backends/x86/jit/helper.h" +#include "lite/backends/x86/jit/kernel_base.h" +#include "lite/backends/x86/jit/kernels.h" +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" +#include "lite/operators/layer_norm_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class LayerNormCompute : public KernelLite { + public: + using param_t = operators::LayerNormParam; + + void Run() override { + auto ¶m = *param_.get_mutable(); + float epsilon = param.epsilon; + auto Scale = param.Scale; + auto Bias = param.Bias; + auto x = param.X; + + auto y = param.Y; + auto Mean = param.Mean; + auto Var = param.Variance; + auto begin_norm_axis = param.begin_norm_axis; + + auto x_dims = x->dims(); + + y->mutable_data(); + Mean->mutable_data(); + Var->mutable_data(); + + auto matrix_dim = x_dims.Flatten2D(begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + lite::DDim matrix_shape({left, right}); + + lite::Tensor in; + in.ShareDataWith(*x); + in.Resize(matrix_shape); + lite::Tensor out; + out.ShareDataWith(*y); + out.Resize(matrix_shape); + + PADDLE_ENFORCE_EQ(Mean->numel(), left); + PADDLE_ENFORCE_EQ(Var->numel(), left); + PADDLE_ENFORCE_EQ(Scale->numel(), right); + PADDLE_ENFORCE_EQ(Bias->numel(), right); + + auto ker = paddle::lite::jit::KernelFuncs, + lite::fluid::CPUPlace>::Cache() + .At(right); + ker(in.mutable_data(), + out.mutable_data(), + Mean->mutable_data(), + Var->mutable_data(), + Scale->data(), + Bias->data(), + static_cast(left), + static_cast(epsilon), + right); + } + + virtual ~LayerNormCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/layer_norm_compute_test.cc b/lite/kernels/x86/layer_norm_compute_test.cc new file mode 100644 index 0000000000..fbac395052 --- /dev/null +++ b/lite/kernels/x86/layer_norm_compute_test.cc @@ -0,0 +1,169 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
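// Note: a worked shape check for the test below (illustration only). With
// x_shape {1, 2, 3, 1} and begin_norm_axis = 0, Flatten2D gives left = 1
// and right = 1 * 2 * 3 * 1 = 6, so Mean and Variance each hold a single
// value, Scale and Bias hold 6 values, and the whole tensor is normalized
// as one row.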
+ +#include "lite/kernels/x86/layer_norm_compute.h" +#include +#include +#include +#include +#include "lite/backends/x86/jit/helper.h" +#include "lite/backends/x86/jit/kernel_base.h" +#include "lite/backends/x86/jit/kernels.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +std::vector ref(lite::Tensor* x, + lite::Tensor* Scale, + lite::Tensor* Bias, + lite::Tensor* y, + lite::Tensor* Mean, + lite::Tensor* Var, + int begin_norm_axis, + float epsilon) { + auto x_dims = x->dims(); + + y->mutable_data(); + Mean->mutable_data(); + Var->mutable_data(); + + auto matrix_dim = x_dims.Flatten2D(begin_norm_axis); + int left = static_cast(matrix_dim[0]); + int right = static_cast(matrix_dim[1]); + lite::DDim matrix_shape({left, right}); + + x->Resize(matrix_shape); + Tensor out; + out.ShareDataWith(*y); + out.Resize(matrix_shape); + + auto ker = paddle::lite::jit::KernelFuncs, + lite::fluid::CPUPlace>::Cache() + .At(right); + ker(x->mutable_data(), + out.mutable_data(), + Mean->mutable_data(), + Var->mutable_data(), + Scale->data(), + Bias->data(), + static_cast(left), + static_cast(epsilon), + right); + + std::vector ref_data; + auto result = out.mutable_data(); + for (int i = 0; i < y->dims().production(); ++i) { + ref_data.emplace_back(result[i]); + } + return ref_data; +} + +// layer_norm +TEST(layer_norm_x86, retrive_op) { + auto layer_norm = + KernelRegistry::Global().Create( + "layer_norm"); + ASSERT_FALSE(layer_norm.empty()); + ASSERT_TRUE(layer_norm.front()); +} + +TEST(layer_norm_x86, init) { + lite::kernels::x86::LayerNormCompute layer_norm; + ASSERT_EQ(layer_norm.precision(), PRECISION(kFloat)); + ASSERT_EQ(layer_norm.target(), TARGET(kX86)); +} + +TEST(layer_norm_x86, run_test) { + lite::Tensor x; + lite::Tensor Scale; + lite::Tensor Bias; + + lite::Tensor out; + lite::Tensor Mean; + lite::Tensor Var; + + std::vector x_shape({1, 2, 3, 1}); + x.Resize(lite::DDim(x_shape)); + std::vector out_shape({1, 2, 3, 1}); + out.Resize(lite::DDim(out_shape)); + + int begin_norm_axis = 0; + float epsilon = 1e-5; + int pre = 1; + int post = 1; + for (int i = 0; i < begin_norm_axis; ++i) { + pre *= x_shape[i]; + } + for (int i = begin_norm_axis; i < x_shape.size(); ++i) { + post *= x_shape[i]; + } + std::vector scale_shape({post}); + Scale.Resize(scale_shape); + std::vector bias_shape({post}); + Bias.Resize(bias_shape); + + auto x_data = x.mutable_data(); + auto scale_data = Scale.mutable_data(); + auto bias_data = Bias.mutable_data(); + auto out_data = out.mutable_data(); + auto mean_data = Mean.mutable_data(); + auto var_data = Var.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); ++i) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < Scale.dims().production(); ++i) { + scale_data[i] = 1.5; + } + for (int64_t i = 0; i < Bias.dims().production(); ++i) { + bias_data[i] = 0.25; + } + + LayerNormCompute layer_norm; + operators::LayerNormParam param; + + param.X = &x; + param.Y = &out; + param.Scale = &Scale; + param.Bias = &Bias; + param.Mean = &Mean; + param.Variance = &Var; + param.begin_norm_axis = begin_norm_axis; + param.epsilon = epsilon; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + layer_norm.SetContext(std::move(ctx)); + layer_norm.SetParam(param); + layer_norm.Run(); + + std::vector ref_data = + ref(&x, &Scale, &Bias, &out, &Mean, &Var, begin_norm_axis, epsilon); + for (int j = 0; j < out.dims().production(); ++j) { + EXPECT_NEAR(out_data[j], ref_data[j], 1e-5); + // LOG(INFO) << 
out_data[j]; + } + LOG(INFO) << *mean_data; + LOG(INFO) << *var_data; +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(layer_norm, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/lookup_table_compute.cc b/lite/kernels/x86/lookup_table_compute.cc index 364593251e..856a07a94c 100644 --- a/lite/kernels/x86/lookup_table_compute.cc +++ b/lite/kernels/x86/lookup_table_compute.cc @@ -32,3 +32,13 @@ REGISTER_LITE_KERNEL(lookup_table, .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); +REGISTER_LITE_KERNEL(lookup_table_v2, + kX86, + kInt64, + kNCHW, + paddle::lite::kernels::x86::LookupTableCompute, + def) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/lookup_table_compute.h b/lite/kernels/x86/lookup_table_compute.h index e0d7752ca7..d5719f332c 100644 --- a/lite/kernels/x86/lookup_table_compute.h +++ b/lite/kernels/x86/lookup_table_compute.h @@ -30,7 +30,6 @@ class LookupTableCompute : public KernelLite { void Run() override { auto ¶m = *param_.get_mutable(); - // auto& context = context_->As(); auto *ids_t = param.Ids; auto *output_t = param.Out; int64_t padding_idx = param.padding_idx; @@ -41,18 +40,18 @@ class LookupTableCompute : public KernelLite { int64_t row_number = table_t->dims()[0]; int64_t row_width = table_t->dims()[1]; - auto *table = table_t->data(); - auto *output = output_t->mutable_data(); - memset(output, 0, output_t->dims().production() * sizeof(float)); + auto *table = table_t->data(); + auto *output = output_t->mutable_data(); + memset(output, 0, output_t->dims().production() * sizeof(T)); for (int64_t i = 0; i < ids_numel; ++i) { if (padding_idx != -1 && ids[i] == padding_idx) { - memset(output + i * row_width, 0, row_width * sizeof(float)); + memset(output + i * row_width, 0, row_width * sizeof(T)); } else { CHECK_LT(ids[i], row_number); CHECK_GE(ids[i], 0); memcpy(output + i * row_width, table + ids[i] * row_width, - row_width * sizeof(float)); + row_width * sizeof(T)); } } } diff --git a/lite/kernels/x86/lookup_table_compute_test.cc b/lite/kernels/x86/lookup_table_compute_test.cc new file mode 100644 index 0000000000..86b2d39186 --- /dev/null +++ b/lite/kernels/x86/lookup_table_compute_test.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
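// Note: a minimal standalone sketch (illustration only) of the lookup
// behaviour templated above: each id selects one row of width row_width
// from the table W, and ids equal to padding_idx yield an all-zero row.
// The helper name is hypothetical.
#include <cstdint>
#include <cstring>

template <typename T>
void LookupRows(const T* table, int64_t row_width, const int64_t* ids,
                int64_t ids_num, int64_t padding_idx, T* out) {
  for (int64_t i = 0; i < ids_num; ++i) {
    if (padding_idx != -1 && ids[i] == padding_idx) {
      std::memset(out + i * row_width, 0, row_width * sizeof(T));
    } else {
      std::memcpy(out + i * row_width, table + ids[i] * row_width,
                  row_width * sizeof(T));
    }
  }
}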
+ +#include "lite/kernels/x86/lookup_table_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(lookup_table_x86, compute) { + LookupTableCompute lookup_table; + operators::LookupTableParam param; + lite::Tensor w, ids, out, out_ref; + int64_t padding_idx = -1; + + int vocab_size = 40; + int emb_size = 50; + int ids_h = 30; + int ids_w = 20; + + auto w_dim = DDim({vocab_size, emb_size}); + auto ids_dim = DDim({ids_h, ids_w}); + auto out_dim = DDim({ids_h, ids_w, emb_size}); + + w.Resize(w_dim); + ids.Resize(ids_dim); + out.Resize(out_dim); + out_ref.Resize(out_dim); + + auto* w_data = w.mutable_data(); + auto* ids_data = ids.mutable_data(); + auto* out_data = out.mutable_data(); + auto* out_ref_data = out_ref.mutable_data(); + + int w_num = w_dim.production(); + for (int i = 0; i < w_num; i++) { + w_data[i] = static_cast(i + 1) / (w_num + 1); + } + int ids_num = ids_dim.production(); + for (int i = 0; i < ids_num; i++) { + ids_data[i] = i % vocab_size; + } + int out_num = out_dim.production(); + for (int i = 0; i < out_num; i++) { + out_ref_data[i] = + static_cast((i % (vocab_size * emb_size)) + 1) / (w_num + 1); + } + + param.W = &w; + param.Ids = &ids; + param.Out = &out; + param.padding_idx = padding_idx; + lookup_table.SetParam(param); + lookup_table.Run(); + for (int i = 0; i < out_num; i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(lookup_table, kX86, kInt64, kNCHW, def); diff --git a/lite/kernels/x86/match_matrix_tensor_compute.cc b/lite/kernels/x86/match_matrix_tensor_compute.cc new file mode 100644 index 0000000000..feda180d22 --- /dev/null +++ b/lite/kernels/x86/match_matrix_tensor_compute.cc @@ -0,0 +1,142 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/match_matrix_tensor_compute.h" +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +void MatchMatrixTensorCompute::Run() { + auto& context = ctx_->As(); + auto& param = this->Param(); + auto* x = param.x; + auto* w = param.w; + auto* y = param.y; + auto* out = param.out; + auto* tmp = param.tmp; + int dim_t = param.dim_t; + int dim_in = x->dims()[1]; + + const auto& offset_l = x->lod()[0]; + const auto& offset_r = y->lod()[0]; + + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + top_size += dim_t * len_l * len_r; + top_offset.push_back(top_size); + } + + auto* bottom_l_data = x->template data(); + auto* bottom_r_data = y->template data(); + auto* t_data = w->template data(); + auto* out_data = out->template mutable_data(); + auto* bottom_l_trans_data = tmp->template mutable_data(); + memset(out_data, 0.0, out->dims()[0] * out->dims()[1] * sizeof(T)); + memset(bottom_l_trans_data, 0.0, tmp->dims()[0] * tmp->dims()[1] * sizeof(T)); + + auto blas = lite::x86::math::GetBlas(context); + blas.GEMM(CblasNoTrans, + CblasNoTrans, + x->dims()[0], + dim_t * dim_in, + dim_in, + 1.0f, + bottom_l_data, + dim_in, + t_data, + dim_t * dim_in, + 0.0f, + bottom_l_trans_data, + dim_t * dim_in); + + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + for (int t = 0; t < dim_t; t++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + auto* top_data = out_data + top_offset[b] + t * len_l * len_r; + const auto* l_t_data = + bottom_l_trans_data + offset_l[b] * dim_t * dim_in + t * dim_in; + const auto* r_data = bottom_r_data + offset_r[b] * dim_in; + + auto blas = lite::x86::math::GetBlas(context); + blas.GEMM(CblasNoTrans, + CblasTrans, + len_l, + len_r, + dim_in, + 1.0f, + l_t_data, + dim_t * dim_in, + r_data, + dim_in, + 0.0f, + top_data, + len_r); + } + } + + int batch_size = x->lod()[0].size() - 1; + int lod_lv1_size = batch_size * dim_t; + int lod_lv2_size = x->lod()[0].back() * dim_t; + std::vector out_lod0(batch_size + 1, 0); + std::vector out_lod1(lod_lv1_size + 1, 0); + std::vector out_lod2(lod_lv2_size + 1, 0); + for (int i = 0; i < batch_size; i++) { + out_lod0[i + 1] = out_lod0[i] + dim_t; + int len_l = offset_l[i + 1] - offset_l[i]; + + for (int j = 0; j < dim_t; j++) { + out_lod1[i * dim_t + j + 1] = out_lod1[i * dim_t + j] + len_l; + int len_r = offset_r[i + 1] - offset_r[i]; + + for (int k = 0; k < len_l; k++) { + out_lod2[offset_l[i] * dim_t + j * len_l + k + 1] = + out_lod2[offset_l[i] * dim_t + j * len_l + k] + len_r; + } + } + } + + LoD out_lod; + out_lod.push_back(top_offset); + out_lod.push_back(offset_l); + out_lod.push_back(offset_r); + out->set_lod(out_lod); +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + match_matrix_tensor, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::MatchMatrixTensorCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Tmp", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/match_matrix_tensor_compute.h b/lite/kernels/x86/match_matrix_tensor_compute.h 
new file mode 100644 index 0000000000..6189676fd8 --- /dev/null +++ b/lite/kernels/x86/match_matrix_tensor_compute.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "lite/backends/x86/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class MatchMatrixTensorCompute + : public KernelLite { + public: + using param_t = operators::MatchMatrixTensorParam; + + void Run() override; + + virtual ~MatchMatrixTensorCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/match_matrix_tensor_compute_test.cc b/lite/kernels/x86/match_matrix_tensor_compute_test.cc new file mode 100644 index 0000000000..0c3f3ad509 --- /dev/null +++ b/lite/kernels/x86/match_matrix_tensor_compute_test.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
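// Note: a worked size check for the unit test below (illustration only).
// With ix = 5, iy = 4, h = dim_in = 2, dim_t = 2, x_lod {0, 2, 5} and
// y_lod {0, 3, 4}, the sequence lengths are len_l = {2, 3} and
// len_r = {3, 1}, so Out needs dim_t * (2 * 3) + dim_t * (3 * 1) = 12 + 6
// = 18 entries and Tmp (the X * W buffer) needs ix * dim_t * h = 20,
// matching the Resize({18, 1}) and Resize({20, 1}) calls in the test.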
+ +#include "lite/kernels/x86/match_matrix_tensor_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(match_matrix_tensor_x86, retrive_op) { + auto kernel = + KernelRegistry::Global().Create( + "match_matrix_tensor"); + ASSERT_FALSE(kernel.empty()); + ASSERT_TRUE(kernel.front()); +} + +TEST(match_matrix_tensor_x86, init) { + MatchMatrixTensorCompute mmtc; + ASSERT_EQ(mmtc.precision(), PRECISION(kFloat)); + ASSERT_EQ(mmtc.target(), TARGET(kX86)); +} + +TEST(match_matrix_tensor_x86, run_test) { + int ix = 5, iy = 4, h = 2, dim_t = 2; + lite::Tensor x, w, y, out, tmp; + x.Resize({ix, h}); + w.Resize({h, dim_t, h}); + y.Resize({iy, h}); + out.Resize({18, 1}); + tmp.Resize({20, 1}); + + LoD x_lod{}; + x_lod.push_back({0, 2, 5}); + x.set_lod(x_lod); + LoD y_lod{}; + y_lod.push_back({0, 3, 4}); + y.set_lod(y_lod); + + auto* x_data = x.mutable_data(); + for (int64_t i = 0; i < x.numel(); i++) { + x_data[i] = static_cast(i); + } + auto* y_data = y.mutable_data(); + for (int64_t i = 0; i < y.numel(); i++) { + y_data[i] = static_cast(i); + } + auto* w_data = w.mutable_data(); + for (int64_t i = 0; i < w.numel(); i++) { + w_data[i] = static_cast(i); + } + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + MatchMatrixTensorCompute mmtc; + mmtc.SetContext(std::move(ctx)); + + operators::MatchMatrixTensorParam param; + param.x = &x; + param.w = &w; + param.y = &y; + param.dim_t = dim_t; + param.out = &out; + param.tmp = &tmp; + + mmtc.SetParam(param); + mmtc.Run(); + + std::vector ref_results = {5, + 23, + 41, + 17, + 75, + 133, + 7, + 33, + 59, + 27, + 125, + 223, + 323, + 455, + 587, + 557, + 793, + 1029}; + auto* out_data = out.mutable_data(); + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_results[i], 1e-3); + // LOG(INFO) << out_data[i]; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(match_matrix_tensor, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/mean_compute.cc b/lite/kernels/x86/mean_compute.cc index b618d2d377..1216d99ad8 100644 --- a/lite/kernels/x86/mean_compute.cc +++ b/lite/kernels/x86/mean_compute.cc @@ -54,29 +54,6 @@ class MeanCompute : public KernelLite { virtual ~MeanCompute() = default; }; -template -class MeanGradCompute : public KernelLite { - public: - using param_t = operators::MeanGradParam; - - void Run() override { - auto& param = *param_.get_mutable(); - auto& context = ctx_->As(); - CHECK_EQ(param.Out_grad->raw_tensor().numel(), 1); - CHECK(context.x86_device_context()); - - param.X_grad->template mutable_data(); - T x_grad_size = static_cast(param.X_grad->raw_tensor().numel()); - Eigen::DSizes bcast(static_cast(x_grad_size)); - EigenVector::Flatten(param.X_grad->raw_tensor()) - .device(*(context.x86_device_context()->eigen_device())) = - (EigenVector::From(param.Out_grad->raw_tensor()) / x_grad_size) - .broadcast(bcast); - } - - virtual ~MeanGradCompute() = default; -}; - } // namespace x86 } // namespace kernels } // namespace lite @@ -93,16 +70,3 @@ REGISTER_LITE_KERNEL(mean, .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); - -REGISTER_LITE_KERNEL(mean_grad, - kX86, - kFloat, - kNCHW, - paddle::lite::kernels::x86::MeanGradCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) - .BindInput(paddle::framework::GradVarName("Out"), - 
{LiteType::GetTensorTy(TARGET(kX86))}) - .BindOutput(paddle::framework::GradVarName("X"), - {LiteType::GetTensorTy(TARGET(kX86))}) - .Finalize(); diff --git a/lite/kernels/x86/mul_compute.cc b/lite/kernels/x86/mul_compute.cc index 64558f6677..3de4340543 100644 --- a/lite/kernels/x86/mul_compute.cc +++ b/lite/kernels/x86/mul_compute.cc @@ -24,21 +24,3 @@ REGISTER_LITE_KERNEL(mul, .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); - -// #ifdef LITE_WITH_TRAIN -// REGISTER_LITE_KERNEL(mul_grad, -// kX86, -// kFloat, -// kNCHW, -// paddle::lite::kernels::x86::MulGradCompute, -// def) -// .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) -// .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) -// .BindInput(paddle::framework::GradVarName("Out"), -// {LiteType::GetTensorTy(TARGET(kX86))}) -// .BindOutput(paddle::framework::GradVarName("X"), -// {LiteType::GetTensorTy(TARGET(kX86))}) -// .BindOutput(paddle::framework::GradVarName("Y"), -// {LiteType::GetTensorTy(TARGET(kX86))}) -// .Finalize(); -// #endif diff --git a/lite/kernels/x86/mul_compute.h b/lite/kernels/x86/mul_compute.h index e204fc81f2..be58f24ba2 100644 --- a/lite/kernels/x86/mul_compute.h +++ b/lite/kernels/x86/mul_compute.h @@ -81,78 +81,6 @@ class MulCompute : public KernelLite { virtual ~MulCompute() = default; }; -#ifdef LITE_WITH_TRAIN -template -class MulGradCompute : public KernelLite { - public: - void Run() override { - auto& context = ctx_->As(); - auto& param = *param_.get_mutable(); - CHECK(context.x86_device_context()); - - auto* x = ¶m.x->raw_tensor(); - auto* y = ¶m.y->raw_tensor(); - - Tensor x_matrix, y_matrix; - - if (x->dims().size() > 2) { - x_matrix = framework::ReshapeToMatrix(*x, param.x_num_col_dims); - } else { - x_matrix = *x; - } - - if (y->dims().size() > 2) { - y_matrix = framework::ReshapeToMatrix(*y, param.y_num_col_dims); - - } else { - y_matrix = *y; - } - - auto* dout = ¶m.output_grad->raw_tensor(); - - Tensor dout_mat; - dout_mat.ShareDataWith(*dout); - dout_mat.Resize( - {framework::flatten_to_2d(x->dims(), param.x_num_col_dims)[0], - framework::flatten_to_2d(y->dims(), param.y_num_col_dims)[1]}); - - auto* dx = ¶m.x_grad->raw_tensor(); - auto* dy = ¶m.y_grad->raw_tensor(); - - if (dx != nullptr) { - dx->set_lod(x->lod()); - } - if (dy != nullptr) { - dy->set_lod(y->lod()); - } - - auto blas = paddle::operators::math::GetBlas( - *context.x86_device_context()); - if (dx) { - // dx->mutable_data(context.x86_device_context->GetPlace()); - param.x_grad->template mutable_data(); - Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix( - *dx, param.x_num_col_dims) - : *dx; - - // dx = dout * y'. dx: M x K, dout : M x N, y : K x N - blas.MatMul(dout_mat, false, y_matrix, true, &dx_matrix); - } - if (dy) { - // dy->yutable_data(context.x86_device_context->GetPlace()); - param.y_grad->template mutable_data(); - Tensor dy_matrix = dy->dims().size() > 2 ? framework::ReshapeToMatrix( - *dy, param.y_num_col_dims) - : *dy; - // dy = x' * dout. 
dy K x N, dout : M x N, x : M x K - blas.MatMul(x_matrix, true, dout_mat, false, &dy_matrix); - } - } - - virtual ~MulGradCompute() = default; -}; -#endif - } // namespace x86 } // namespace kernels } // namespace lite diff --git a/lite/kernels/x86/pool_compute.h b/lite/kernels/x86/pool_compute.h index 57bcddcec9..0dccb245b1 100644 --- a/lite/kernels/x86/pool_compute.h +++ b/lite/kernels/x86/pool_compute.h @@ -35,7 +35,6 @@ class PoolCompute : public KernelLite { auto& param = *param_.get_mutable(); if (param.global_pooling) { for (size_t i = 0; i < param.ksize.size(); ++i) { - param.paddings[i] = 0; param.ksize[i] = static_cast(param.x->dims()[i + 2]); } } @@ -52,7 +51,7 @@ class PoolCompute : public KernelLite { param.x, param.ksize, param.strides, - param.paddings, + *param.paddings, pool_process, true, false, @@ -68,7 +67,7 @@ class PoolCompute : public KernelLite { param.x, param.ksize, param.strides, - param.paddings, + *param.paddings, pool_process, param.exclusive, param.adaptive, diff --git a/lite/kernels/x86/pool_compute_test.cc b/lite/kernels/x86/pool_compute_test.cc index 87b75a0760..4ea727cedd 100644 --- a/lite/kernels/x86/pool_compute_test.cc +++ b/lite/kernels/x86/pool_compute_test.cc @@ -60,7 +60,8 @@ TEST(pool2d_x86, run_test) { param.x = &x; param.output = &out; param.strides = {2, 2}; - param.paddings = {0, 0}; + std::vector paddings = {0, 0, 0, 0}; + param.paddings = std::make_shared>(paddings); param.ksize = {2, 2}; param.pooling_type = "max"; std::unique_ptr ctx(new KernelContext); diff --git a/lite/kernels/x86/search_aligned_mat_mul_compute.cc b/lite/kernels/x86/search_aligned_mat_mul_compute.cc new file mode 100644 index 0000000000..956f2a3beb --- /dev/null +++ b/lite/kernels/x86/search_aligned_mat_mul_compute.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_aligned_mat_mul_compute.h" + +REGISTER_LITE_KERNEL( + search_aligned_mat_mul, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchAlignedMatMulCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("_a_addr", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("_b_addr", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("_c_addr", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_aligned_mat_mul_compute.h b/lite/kernels/x86/search_aligned_mat_mul_compute.h new file mode 100644 index 0000000000..ea6b546c2c --- /dev/null +++ b/lite/kernels/x86/search_aligned_mat_mul_compute.h @@ -0,0 +1,83 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/x86/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchAlignedMatMulCompute + : public KernelLite { + public: + using param_t = operators::MatMulParam; + + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + + auto x = param.X; + auto y = param.Y; + auto out = param.Out; + bool x_transpose = param.transpose_X; + bool y_transpose = param.transpose_Y; + float alpha = param.alpha; + const auto x_dims = x->dims(); + const auto y_dims = y->dims(); + const auto& x_lod = x->lod(); + const auto& y_lod = y->lod(); + const auto& x_lod_0 = x_lod[0]; + const auto& y_lod_0 = y_lod[0]; + + int seq_num = x_lod_0.size() - 1; + int x_inner_size = x_dims[1]; + int y_inner_size = y_dims[1]; + int x_batch_size = x_lod_0[1]; + int y_batch_size = y_lod_0[1]; + int M = x_transpose ? x_inner_size : x_batch_size; + int N = y_transpose ? y_batch_size : y_inner_size; + int X_K = x_transpose ? x_batch_size : x_inner_size; + int Y_K = y_transpose ? y_inner_size : y_batch_size; + CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal"; + int K = X_K; + + lite::x86::math::MatDescriptor mat_dim_a; + mat_dim_a.height_ = M; + mat_dim_a.width_ = K; + mat_dim_a.stride_ = x_batch_size * x_inner_size; + mat_dim_a.batch_size_ = seq_num; + mat_dim_a.trans_ = x_transpose; + lite::x86::math::MatDescriptor mat_dim_b; + mat_dim_b.height_ = K; + mat_dim_b.width_ = N; + mat_dim_b.stride_ = y_batch_size * y_inner_size; + mat_dim_b.batch_size_ = seq_num; + mat_dim_b.trans_ = y_transpose; + auto blas = lite::x86::math::GetBlas(context); + blas.MatMul(*x, mat_dim_a, *y, mat_dim_b, static_cast(alpha), out, T(0)); + } + + virtual ~SearchAlignedMatMulCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/search_fc_compute.cc b/lite/kernels/x86/search_fc_compute.cc new file mode 100644 index 0000000000..cf76113e01 --- /dev/null +++ b/lite/kernels/x86/search_fc_compute.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
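// Note: an illustration (not code from this patch) of how the batched GEMM
// descriptors above can be read, ignoring the transpose flags the real
// kernel also handles. Every sequence in the LoD batch is assumed to have
// the same length, so mat_dim_a.stride_ = x_batch_size * x_inner_size is
// the element offset between consecutive [M, K] blocks of X, and likewise
// for Y. A plain-loop equivalent with a hypothetical name:
#include <cstddef>

inline void BatchedMatMulRef(const float* A, const float* B, float* C,
                             int batch, int M, int N, int K,
                             std::ptrdiff_t stride_a, std::ptrdiff_t stride_b,
                             float alpha) {
  for (int b = 0; b < batch; ++b) {
    const float* a = A + b * stride_a;
    const float* bm = B + b * stride_b;
    float* c = C + static_cast<std::ptrdiff_t>(b) * M * N;
    for (int m = 0; m < M; ++m) {
      for (int n = 0; n < N; ++n) {
        float acc = 0.f;
        for (int k = 0; k < K; ++k) acc += a[m * K + k] * bm[k * N + n];
        c[m * N + n] = alpha * acc;  // beta = 0: output is overwritten
      }
    }
  }
}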
+ +#include "lite/kernels/x86/search_fc_compute.h" + +REGISTER_LITE_KERNEL(search_fc, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_fc_compute.h b/lite/kernels/x86/search_fc_compute.h new file mode 100644 index 0000000000..e0f44de526 --- /dev/null +++ b/lite/kernels/x86/search_fc_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/x86/math/search_fc.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchFcCompute : public KernelLite { + public: + using param_t = operators::SearchFcParam; + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + + param.Out->Resize({param.X->dims()[0], param.out_size}); + lite::x86::math::SearchFcFunctor search_fc; + search_fc(context, *param.X, *param.W, *param.b, param.Out, param.out_size); + } + virtual ~SearchFcCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/search_fc_compute_test.cc b/lite/kernels/x86/search_fc_compute_test.cc new file mode 100644 index 0000000000..425df2a0f0 --- /dev/null +++ b/lite/kernels/x86/search_fc_compute_test.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/search_fc_compute.h" +#include +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +void fc_cpu_base(const lite::Tensor* X, + const lite::Tensor* W, + const lite::Tensor* b, + int out_size, + lite::Tensor* Out) { + const float* data_in = X->data(); + const float* bias = b->data(); + const float* weights = W->data(); + float* data_out = Out->mutable_data(); + int out_rows = X->dims()[0]; + int in_cols = X->numel() / out_rows; + int out_cols = W->numel() / in_cols; + int index_out; + + for (int i = 0; i < out_rows; i++) { + for (int j = 0; j < out_cols; j++) { + index_out = i * out_cols + j; + data_out[index_out] = bias ? bias[j] : 0; + + for (int k = 0; k < in_cols; k++) { + data_out[index_out] += + data_in[i * in_cols + k] * weights[j * in_cols + k]; + } + } + } +} + +TEST(search_fc_x86, retrive_op) { + auto search_fc = + KernelRegistry::Global().Create( + "search_fc"); + ASSERT_FALSE(search_fc.empty()); + ASSERT_TRUE(search_fc.front()); +} + +TEST(search_fc_x86, init) { + SearchFcCompute search_fc; + ASSERT_EQ(search_fc.precision(), PRECISION(kFloat)); + ASSERT_EQ(search_fc.target(), TARGET(kX86)); +} + +TEST(search_fc_x86, run_test) { + lite::Tensor x, w, b, out; + lite::Tensor out_ref; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + std::vector x_shape{1, 4}; + x.Resize(lite::DDim(x_shape)); + std::vector w_shape{3, 4}; + w.Resize(lite::DDim(w_shape)); + std::vector b_shape{3}; + b.Resize(lite::DDim(b_shape)); + std::vector out_shape{1, 4}; + out.Resize(lite::DDim(out_shape)); + out_ref.Resize(lite::DDim(out_shape)); + auto x_data = x.mutable_data(); + auto w_data = w.mutable_data(); + auto b_data = b.mutable_data(); + auto out_data = out.mutable_data(); + auto out_data_ref = out_ref.mutable_data(); + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + for (int64_t i = 0; i < w.dims().production(); i++) { + w_data[i] = static_cast(i); + } + for (int64_t i = 0; i < b.dims().production(); i++) { + b_data[i] = static_cast(i); + } + + fc_cpu_base(&x, &w, &b, 4, &out_ref); + + SearchFcCompute fc; + operators::SearchFcParam param; + param.X = &x; + param.W = &w; + param.b = &b; + param.Out = &out; + param.out_size = 4; + fc.SetParam(param); + fc.SetContext(std::move(ctx)); + fc.Run(); + + VLOG(3) << "output vs ref"; + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], out_data_ref[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_fc, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/search_grnn_compute.cc b/lite/kernels/x86/search_grnn_compute.cc new file mode 100644 index 0000000000..95839ba71b --- /dev/null +++ b/lite/kernels/x86/search_grnn_compute.cc @@ -0,0 +1,332 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
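The search_grnn kernel below first reorders the batch into a time-major layout: sequences are sorted by length (descending) and new_offset[t] marks how many rows cover time steps 0..t-1 across the still-active sequences. An equivalent standalone reformulation of that offset computation, counting active sequences per step (plain C++; illustrative only, not the code used by the kernel):

#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

// offsets[t] = number of rows belonging to time steps < t when sequences
// are laid out step by step in descending-length order.
std::vector<int> TimeMajorOffsets(const std::vector<int>& lengths) {
  std::vector<int> sorted = lengths;
  std::sort(sorted.begin(), sorted.end(), std::greater<int>());
  int max_len = sorted.front();
  std::vector<int> offsets(max_len + 1, 0);
  for (int t = 0; t < max_len; ++t) {
    // sequences still active at time step t
    int active = std::count_if(sorted.begin(), sorted.end(),
                               [t](int len) { return len > t; });
    offsets[t + 1] = offsets[t] + active;
  }
  return offsets;
}

int main() {
  // lod {0, 1, 3} -> lengths {1, 2} -> offsets {0, 2, 3}: step 0 holds one
  // row from each sequence, step 1 only the longer one.
  for (int v : TimeMajorOffsets({1, 2})) std::printf("%d ", v);
  return 0;
}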
+ +#include "lite/kernels/x86/search_grnn_compute.h" +#include +#include +#include "lite/backends/x86/math/blas.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +T sigmoid(T z) { + return 1 / (1 + std::exp(-z)); +} + +template +void CallGemm(const lite::x86::math::BlasT& blas, + const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, + const int M, + const int N, + const int K, + const T alpha, + const T* A, + const T* B, + const T beta, + T* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +} + +template +void SearchGrnnCompute::PrepareLayout(const Tensor* input_blob) { + auto& param = this->Param(); + auto* _idx_sorted_by_width = param.idx_sorted_by_width; + auto* _layout_input = param.layout_input; + auto* _input = input_blob; + + // usually total length + int dim0 = _input->dims()[0]; + // if it is id only sequence + int dim1 = 1; + // if its a embedding like sequence (dim1 would be embedding_size) + if (_input->dims().size() > 1) { + dim1 = _input->dims()[1]; + } + + int batch = _input->lod()[0].size() - 1; + auto& offset = _input->lod()[0]; + + Tensor _width; + _width.Resize({batch}); + _idx_sorted_by_width->Resize({batch}); + int* width_data = _width.template mutable_data(); + int* idx_sorted_by_width_data = + _idx_sorted_by_width->template mutable_data(); + // sort sequence by width (descending) and find the largest width in the + // batch + for (int i = 0; i < batch; i++) { + width_data[i] = offset[i + 1] - offset[i]; + idx_sorted_by_width_data[i] = i; + } + std::sort(idx_sorted_by_width_data, + idx_sorted_by_width_data + batch, + [&_width](int a, int b) { + return _width.template data()[a] > + _width.template data()[b]; + }); + int max_width = width_data[idx_sorted_by_width_data[0]]; + + // start of reorganizing the input + std::vector new_offset; + new_offset.resize(max_width + 1); + + new_offset[0] = 0; + int j = batch - 1; + int last_width = 0; + int sub_row = 0; + int sub_col = 0; + + for (int i = 1; i <= max_width;) { + for (int k = j; k >= 0; --k) { + if (width_data[idx_sorted_by_width_data[k]] > last_width) { + sub_row = width_data[idx_sorted_by_width_data[k]] - last_width; + sub_col = k + 1; + + for (int s = 0; s < sub_row; s++) { + new_offset[i] = new_offset[i - 1] + sub_col; + i++; + } + // move on + last_width = width_data[idx_sorted_by_width_data[k]]; + j = k - 1; + break; + } + } + } + + // copying to the reorganized buffer + if (_input->dims().size() == 1) { + // _layout_input.reshape_batch_sequence({dim0}, new_offset); + LOG(FATAL) << "_input->dims().size() = 1, error."; + } else { + // _layout_input.reshape_batch_sequence({dim0, dim1}, new_offset); + LoD new_lod; + new_lod.push_back(new_offset); + _layout_input->set_lod(new_lod); + _layout_input->Resize({dim0, dim1}); + } + + auto* new_emb = _layout_input->template mutable_data(); + for (int i = 0; i < max_width; i++) { + int w = new_offset[i + 1] - new_offset[i]; + auto* emb_start = new_emb + dim1 * new_offset[i]; + for (int j = 0; j < w; ++j) { + memcpy(emb_start + dim1 * j, + _input->template data() + + dim1 * offset[idx_sorted_by_width_data[j]] + dim1 * i, + dim1 * sizeof(T)); + } + } +} + +template +void SearchGrnnCompute::CopyBack(T* from, T* to, int step) { + auto& param = this->Param(); + auto* _input = param.x; + auto* _layout_input = param.layout_input; + auto* _idx_sorted_by_width = param.idx_sorted_by_width; + + const auto& offset = 
_input->lod()[0]; + const auto& new_offset = _layout_input->lod()[0]; + const auto* idx_sorted_by_width_data = + _idx_sorted_by_width->template data(); + for (size_t i = 0; i < _layout_input->lod()[0].size() - 1; ++i) { + int w = new_offset[i + 1] - new_offset[i]; + for (int j = 0; j < w; j++) { + memcpy(to + step * (offset[idx_sorted_by_width_data[j]] + i), + from + (new_offset[i] + j) * step, + step * sizeof(T)); + } + } +} + +template +void SearchGrnnCompute::Run() { + auto& context = ctx_->As(); + auto& param = this->Param(); + auto* bottom = param.x; + auto* wi = param.wi; + auto* wh = param.wh; + auto* top = param.out; + auto* _buffer = param.tmp_buffer; + int _cap_h = param.num_hidden; + int _cap_e = param.num_input; + + int _cap_l = bottom->dims()[0]; + int batch = bottom->lod()[0].size() - 1; + + const auto& offset = bottom->lod()[0]; + LoD top_lod; + top_lod.push_back(offset); + top->set_lod(top_lod); + std::vector top_dims_vec{_cap_l, _cap_h}; + top->Resize(top_dims_vec); + auto* top_hidden = top->template mutable_data(); + + const auto* dense_e2h = wi->template data(); + const auto* dense_h2h = wh->template data(); + + const auto* e2h = dense_e2h; + const auto* e2hr = dense_e2h + 1 * _cap_e * _cap_h; + const auto* e2hz = dense_e2h + 2 * _cap_e * _cap_h; + const auto* h2h = dense_h2h; + const auto* h2hr = dense_h2h + 1 * _cap_h * _cap_h; + const auto* h2hz = dense_h2h + 2 * _cap_h * _cap_h; + + PrepareLayout(bottom); + + auto* _layout_input = param.layout_input; + auto* new_emb = _layout_input->template mutable_data(); + const auto& new_offset = _layout_input->lod()[0]; + int max_width = _layout_input->lod()[0].size() - 1; + + // this buffer is used for book keeping info which will be used in bp + // buffer also needed in bp, so make it larger + _buffer->Resize({20, _cap_l, _cap_h}); + auto* buffer_data = _buffer->template mutable_data(); + auto* w_x_e = buffer_data + 0 * _cap_l * _cap_h; + auto* wr_x_e = buffer_data + 1 * _cap_l * _cap_h; + auto* wz_x_e = buffer_data + 2 * _cap_l * _cap_h; + auto* u_x_h = buffer_data + 3 * _cap_l * _cap_h; + auto* ur_x_h = buffer_data + 4 * _cap_l * _cap_h; + auto* uz_x_h = buffer_data + 5 * _cap_l * _cap_h; + auto* r = buffer_data + 6 * _cap_l * _cap_h; + auto* z = buffer_data + 7 * _cap_l * _cap_h; + auto* tilde = buffer_data + 8 * _cap_l * _cap_h; + // the internal hidden + auto* hidden = buffer_data + 19 * _cap_l * _cap_h; + + auto blas = lite::x86::math::GetBlas(context); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + _cap_l, + _cap_h, + _cap_e, + 1.0f, + new_emb, + e2h, + 0.0f, + w_x_e); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + _cap_l, + _cap_h, + _cap_e, + 1.0f, + new_emb, + e2hr, + 0.0f, + wr_x_e); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + _cap_l, + _cap_h, + _cap_e, + 1.0f, + new_emb, + e2hz, + 0.0f, + wz_x_e); + + // precompute hidden0 + for (int i = 0; i < batch * _cap_h; i++) { + tilde[i] = std::tanh(w_x_e[i]); + z[i] = sigmoid(wz_x_e[i]); + hidden[i] = (1. 
- z[i]) * tilde[i]; + } + + // recurrence + for (int i = 1; i < max_width; i++) { + int w_tm1 = new_offset[i] - new_offset[i - 1]; + int w = new_offset[i + 1] - new_offset[i]; + + // precompute hidden i-1 to hidden i + auto* htm1 = hidden + new_offset[i - 1] * _cap_h; + + CallGemm(blas, + CblasNoTrans, + CblasTrans, + w, + _cap_h, + _cap_h, + 1.0f, + htm1, + h2h, + 0.0f, + u_x_h + new_offset[i] * _cap_h); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + w, + _cap_h, + _cap_h, + 1.0f, + htm1, + h2hr, + 0.0f, + ur_x_h + new_offset[i] * _cap_h); + CallGemm(blas, + CblasNoTrans, + CblasTrans, + w, + _cap_h, + _cap_h, + 1.0f, + htm1, + h2hz, + 0.0f, + uz_x_h + new_offset[i] * _cap_h); + + // compute the gate and hidden + for (size_t j = new_offset[i] * _cap_h; j < (new_offset[i] + w) * _cap_h; + j++) { + r[j] = sigmoid(wr_x_e[j] + ur_x_h[j]); + z[j] = sigmoid(wz_x_e[j] + uz_x_h[j]); + tilde[j] = std::tanh(w_x_e[j] + r[j] * u_x_h[j]); + hidden[j] = z[j] * hidden[j - _cap_h * w_tm1] + (1.0 - z[j]) * tilde[j]; + } + } + + CopyBack(hidden, top_hidden, _cap_h); +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(search_grnn, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchGrnnCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Wi", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Wh", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("tmp_buffer", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("idx_sorted_by_width", + {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))}) + .BindOutput("layout_input", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_grnn_compute.h b/lite/kernels/x86/search_grnn_compute.h new file mode 100644 index 0000000000..66866761e1 --- /dev/null +++ b/lite/kernels/x86/search_grnn_compute.h @@ -0,0 +1,44 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/x86/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchGrnnCompute : public KernelLite { + public: + using param_t = operators::SearchGrnnParam; + + void Run() override; + + virtual ~SearchGrnnCompute() = default; + + private: + void PrepareLayout(const Tensor* input); + void CopyBack(T* from, T* to, int step); +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/search_grnn_compute_test.cc b/lite/kernels/x86/search_grnn_compute_test.cc new file mode 100644 index 0000000000..b85d97e3f1 --- /dev/null +++ b/lite/kernels/x86/search_grnn_compute_test.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_grnn_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(search_grnn_x86, retrive_op) { + auto kernel = + KernelRegistry::Global().Create( + "search_grnn"); + ASSERT_FALSE(kernel.empty()); + ASSERT_TRUE(kernel.front()); +} + +TEST(search_grnn_x86, init) { + SearchGrnnCompute ssdc; + ASSERT_EQ(ssdc.precision(), PRECISION(kFloat)); + ASSERT_EQ(ssdc.target(), TARGET(kX86)); +} + +TEST(search_grnn_x86, run_test) { + int num_input = 128; + int num_hidden = 128; + int num_batch = 3; + lite::Tensor x, wi, wh, out, idx_sorted_by_width, layout_input, tmp_buffer; + x.Resize({num_batch, num_input}); + wi.Resize({3, num_hidden, num_input}); + wh.Resize({3, num_hidden, num_hidden}); + // out.Resize({num_batch, num_hidden}); + LoD x_lod{}; + x_lod.push_back({0, 1, 3}); + x.set_lod(x_lod); + + auto* x_data = x.mutable_data(); + for (int64_t i = 0; i < x.numel(); i++) { + x_data[i] = static_cast(i); + } + auto* wi_data = wi.mutable_data(); + for (int64_t i = 0; i < wi.numel(); i++) { + wi_data[i] = static_cast(i); + } + auto* wh_data = wh.mutable_data(); + for (int64_t i = 0; i < wh.numel(); i++) { + wh_data[i] = static_cast(i); + } + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + + operators::SearchGrnnParam param; + param.x = &x; + param.wi = &wi; + param.wh = &wh; + param.out = &out; + param.idx_sorted_by_width = &idx_sorted_by_width; + param.layout_input = &layout_input; + param.tmp_buffer = &tmp_buffer; + param.num_input = num_input; + param.num_hidden = num_hidden; + + SearchGrnnCompute sgc; + sgc.SetContext(std::move(ctx)); + sgc.SetParam(param); + sgc.Run(); + + // std::vector ref_results = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19}; + auto* out_data = out.mutable_data(); + LOG(INFO) << out.numel(); + for (int i = 0; i < out.numel(); i++) { + // EXPECT_NEAR(out_data[i], ref_results[i], 1e-3); + LOG(INFO) << out_data[i]; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_grnn, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/search_group_padding_compute.cc b/lite/kernels/x86/search_group_padding_compute.cc new file mode 100644 index 0000000000..d1847ac9db --- /dev/null +++ b/lite/kernels/x86/search_group_padding_compute.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_group_padding_compute.h" + +REGISTER_LITE_KERNEL( + search_group_padding, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchGroupPaddingCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out_emb_padding", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out_new", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out_padding", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_group_padding_compute.h b/lite/kernels/x86/search_group_padding_compute.h new file mode 100644 index 0000000000..17244d15d9 --- /dev/null +++ b/lite/kernels/x86/search_group_padding_compute.h @@ -0,0 +1,105 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchGroupPaddingCompute + : public KernelLite { + public: + using param_t = operators::SearchGroupPaddingParam; + + void Run() override { + auto& param = *param_.get_mutable(); + + auto* bottom0 = param.x; + auto* top0 = param.out_emb_padding; + auto* top1 = param.out_new; + auto* top2 = param.out_padding; + + int _pad_id = param.pad_id; + + int batch = bottom0->lod()[0].size() - 1; + int dim0 = bottom0->dims()[0]; + int dim1 = bottom0->dims()[1]; + + const auto offset = bottom0->lod()[0]; + int max_seq = 0; + for (int i = 0; i < batch; ++i) { + if (offset[i + 1] - offset[i] > max_seq) { + max_seq = offset[i + 1] - offset[i]; + } + } + + std::vector new_offset; + new_offset.resize(batch + 1); + for (int i = 0; i < batch + 1; ++i) { + new_offset[i] = i * max_seq; + } + + // for padding data + lite::LoD top0_lod; + top0_lod.push_back(new_offset); + top0->set_lod(top0_lod); + top0->Resize({batch * max_seq, dim1}); + // for origin input id + // already set by ShareLoD in InferShape + lite::LoD top1_lod; + top1_lod.push_back(offset); + top1->set_lod(top1_lod); + top1->Resize({dim0, 1}); + memset(top1->mutable_data(), + 0, + top1->dims()[0] * top1->dims()[1] * sizeof(T)); + // for padding input id + lite::LoD top2_lod; + top2_lod.push_back(new_offset); + top2->set_lod(top2_lod); + top2->Resize({batch * max_seq, 1}); + // copy data + const auto* bottom_data = bottom0->data(); + auto* top_data = top0->mutable_data(); + auto* top_padding_input_data = top2->mutable_data(); + for (int i = 0; i < batch; i++) { + const int copy_step = offset[i + 1] - offset[i]; + const int start = i * max_seq; + memcpy(top_data + start * dim1, + bottom_data + offset[i] * dim1, + copy_step * dim1 * sizeof(T)); + memset(top_data + (start + copy_step) * dim1, + 0, + (max_seq - copy_step) * dim1 * sizeof(T)); + // for padding input id + memset(top_padding_input_data + start, 0, copy_step * 
sizeof(T)); + for (int j = start + copy_step; j < start + max_seq; j++) { + top_padding_input_data[j] = static_cast(_pad_id); + } + } + } + + virtual ~SearchGroupPaddingCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/search_group_padding_compute_test.cc b/lite/kernels/x86/search_group_padding_compute_test.cc new file mode 100644 index 0000000000..f4c36c2a63 --- /dev/null +++ b/lite/kernels/x86/search_group_padding_compute_test.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_group_padding_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(search_group_padding_x86, retrieve_op) { + auto search_group_padding = + KernelRegistry::Global().Create( + "search_group_padding"); + ASSERT_FALSE(search_group_padding.empty()); + ASSERT_TRUE(search_group_padding.front()); +} + +TEST(search_group_padding_x86, init) { + SearchGroupPaddingCompute search_group_padding; + ASSERT_EQ(search_group_padding.precision(), PRECISION(kFloat)); + ASSERT_EQ(search_group_padding.target(), TARGET(kX86)); +} + +TEST(search_group_padding_x86, run_test) { + lite::Tensor x, out_emb_padding, out_new, out_padding; + x.Resize({2, 3}); + out_emb_padding.Resize({-1, 3}); + out_new.Resize({2, 1}); + out_padding.Resize({-1, 1}); + LoD x_lod{}; + x_lod.push_back({0, 1}); + x.set_lod(x_lod); + + auto* x_data = x.mutable_data(); + for (int64_t i = 0; i < x.dims().production(); i++) { + x_data[i] = static_cast(i); + } + SearchGroupPaddingCompute sgp_kernel; + operators::SearchGroupPaddingParam param; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + sgp_kernel.SetContext(std::move(ctx)); + + param.x = &x; + param.out_emb_padding = &out_emb_padding; + param.out_new = &out_new; + param.out_padding = &out_padding; + + sgp_kernel.SetParam(param); + sgp_kernel.Run(); + + std::vector out_emb_padding_ref = {0, 1, 2}; + std::vector out_new_ref = {0, 0}; + std::vector out_padding_ref = {0}; + auto* out_emb_padding_data = out_emb_padding.mutable_data(); + auto* out_new_data = out_new.mutable_data(); + auto* out_padding_data = out_padding.mutable_data(); + for (int i = 0; i < out_emb_padding.dims().production(); i++) { + EXPECT_NEAR(out_emb_padding_data[i], out_emb_padding_ref[i], 1e-5); + } + for (int i = 0; i < out_new.dims().production(); i++) { + EXPECT_NEAR(out_new_data[i], out_new_ref[i], 1e-5); + } + for (int i = 0; i < out_padding.dims().production(); i++) { + EXPECT_NEAR(out_padding_data[i], out_padding_ref[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_group_padding, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/search_seq_depadding_compute.cc 
b/lite/kernels/x86/search_seq_depadding_compute.cc new file mode 100644 index 0000000000..db1816fb48 --- /dev/null +++ b/lite/kernels/x86/search_seq_depadding_compute.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_seq_depadding_compute.h" +#include + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +void SearchSeqDepaddingCompute::Run() { + auto& param = this->Param(); + auto* pad = param.pad; + auto* src = param.src; + auto* out = param.out; + + const int pad_batch = pad->lod()[0].size() - 1; + const int src_batch = src->lod()[0].size() - 1; + if (pad_batch % src_batch != 0) { + LOG(FATAL) << "Mismatch batch size."; + } + + const auto& pad_offset = pad->lod()[0]; + const int pad_cap_e = pad->dims()[1]; + const auto& src_offset = src->lod()[0]; + const int src_cap_l = src->dims()[0]; + + LoD out_lod; + out_lod.push_back(src_offset); + out->set_lod(out_lod); + out->Resize({src_cap_l, pad_cap_e}); + + const auto* pad_data = pad->template data(); + auto* out_data = out->template mutable_data(); + for (int i = 0; i < src_batch; ++i) { + const int src_i_l = src_offset[i + 1] - src_offset[i]; + const int pad_i_l = pad_offset[i + 1] - pad_offset[i]; + if (pad_i_l < src_i_l) { + LOG(FATAL) + << "the length of padding seq input is less than source seq input."; + } + memcpy(out_data + src_offset[i] * pad_cap_e, + pad_data + pad_offset[i] * pad_cap_e, + src_i_l * pad_cap_e * sizeof(T)); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL( + search_seq_depadding, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchSeqDepaddingCompute, + def) + .BindInput("Pad", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Src", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_seq_depadding_compute.h b/lite/kernels/x86/search_seq_depadding_compute.h new file mode 100644 index 0000000000..e48fa92723 --- /dev/null +++ b/lite/kernels/x86/search_seq_depadding_compute.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
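The depadding kernel above keeps, for each sequence, only the first src_len rows of the corresponding padded segment. A standalone sketch of that copy on flat buffers (plain C++, illustrative names, no Lite types):

#include <cstddef>
#include <cstring>
#include <vector>

// Copies the first src_len rows of each padded segment into a dense output,
// mirroring the memcpy loop in SearchSeqDepaddingCompute::Run().
std::vector<float> Depad(const std::vector<float>& pad,
                         const std::vector<size_t>& pad_offset,
                         const std::vector<size_t>& src_offset, int width) {
  std::vector<float> out(src_offset.back() * width);
  for (size_t i = 0; i + 1 < src_offset.size(); ++i) {
    size_t src_len = src_offset[i + 1] - src_offset[i];
    std::memcpy(out.data() + src_offset[i] * width,
                pad.data() + pad_offset[i] * width,
                src_len * width * sizeof(float));
  }
  return out;
}

// Example matching the unit test below: pad lod {0, 4, 6}, src lod {0, 2, 3},
// width 4, pad filled with 0..23 -> the output is pad rows 0, 1 and 4,
// i.e. {0..7, 16..19}.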
+#pragma once + +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/operators/op_params.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchSeqDepaddingCompute + : public KernelLite { + public: + using param_t = operators::SearchSeqDepaddingParam; + + void Run() override; + + virtual ~SearchSeqDepaddingCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/search_seq_depadding_compute_test.cc b/lite/kernels/x86/search_seq_depadding_compute_test.cc new file mode 100644 index 0000000000..0d978b35ed --- /dev/null +++ b/lite/kernels/x86/search_seq_depadding_compute_test.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_seq_depadding_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +TEST(search_seq_depadding_x86, retrive_op) { + auto kernel = + KernelRegistry::Global().Create( + "search_seq_depadding"); + ASSERT_FALSE(kernel.empty()); + ASSERT_TRUE(kernel.front()); +} + +TEST(search_seq_depadding_x86, init) { + SearchSeqDepaddingCompute ssdc; + ASSERT_EQ(ssdc.precision(), PRECISION(kFloat)); + ASSERT_EQ(ssdc.target(), TARGET(kX86)); +} + +TEST(search_seq_depadding_x86, run_test) { + lite::Tensor pad, src, out; + pad.Resize({2 * 3, 4}); + src.Resize({3, 1}); + out.Resize({3, 4}); + LoD pad_lod{}; + pad_lod.push_back({0, 4, 6}); + pad.set_lod(pad_lod); + LoD src_lod{}; + src_lod.push_back({0, 2, 3}); + src.set_lod(src_lod); + + auto* pad_data = pad.mutable_data(); + for (int64_t i = 0; i < pad.dims().production(); i++) { + pad_data[i] = static_cast(i); + } + SearchSeqDepaddingCompute ssdc; + operators::SearchSeqDepaddingParam param; + + std::unique_ptr ctx(new KernelContext); + ctx->As(); + ssdc.SetContext(std::move(ctx)); + + param.pad = &pad; + param.src = &src; + param.out = &out; + + ssdc.SetParam(param); + ssdc.Run(); + + std::vector ref_results = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19}; + auto* out_data = out.mutable_data(); + for (int i = 0; i < out.dims().production(); i++) { + EXPECT_NEAR(out_data[i], ref_results[i], 1e-3); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(search_seq_depadding, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/search_seq_fc_compute.cc b/lite/kernels/x86/search_seq_fc_compute.cc new file mode 100644 index 0000000000..e0845bd74c --- /dev/null +++ b/lite/kernels/x86/search_seq_fc_compute.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/search_seq_fc_compute.h" + +REGISTER_LITE_KERNEL(search_seq_fc, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SearchSeqFcCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("b", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/search_seq_fc_compute.h b/lite/kernels/x86/search_seq_fc_compute.h new file mode 100644 index 0000000000..80ef54b30b --- /dev/null +++ b/lite/kernels/x86/search_seq_fc_compute.h @@ -0,0 +1,73 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/x86/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SearchSeqFcCompute : public KernelLite { + public: + using param_t = operators::SearchSeqFcParam; + + void Run() override { + auto& context = ctx_->As(); + auto& param = *param_.get_mutable(); + + auto x = param.x; + auto w = param.w; + auto b = param.b; + auto out = param.out; + auto out_size = param.out_size; + const auto x_dims = x->dims(); + const auto w_dims = w->dims(); + const auto out_dims = out->dims(); + CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + CHECK_EQ(out_dims.size(), 2) << "The Output(Out) should be 2-D tensor."; + CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; + CHECK_EQ(w_dims[0], out_size) << "Wrong shape: w_dims[0] != out_size"; + CHECK_EQ(out_dims[0], x_dims[0]) << "Wrong shape: out_dims[0] != x_dims[0]"; + CHECK_EQ(out_dims[1], out_size) << "Wrong shape: out_dims[1] != out_size"; + + auto blas = lite::x86::math::GetBlas(context); + blas.MatMul(*x, false, *w, true, out); + + if (b != nullptr) { + auto b_dims = b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; + int M = x_dims[0]; + int N = w_dims[0]; + for (int i = 0; i < M; i++) { + blas.AXPY( + N, static_cast(1), b->data(), out->mutable_data() + i * N); + } + } + } + + virtual ~SearchSeqFcCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/sequence_arithmetic_compute.cc 
b/lite/kernels/x86/sequence_arithmetic_compute.cc new file mode 100644 index 0000000000..95fa27e3d4 --- /dev/null +++ b/lite/kernels/x86/sequence_arithmetic_compute.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/sequence_arithmetic_compute.h" + +REGISTER_LITE_KERNEL( + sequence_arithmetic, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); +REGISTER_LITE_KERNEL( + search_seq_arithmetic, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SequenceArithmeticCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/sequence_arithmetic_compute.h b/lite/kernels/x86/sequence_arithmetic_compute.h new file mode 100644 index 0000000000..88510b8b1c --- /dev/null +++ b/lite/kernels/x86/sequence_arithmetic_compute.h @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SequenceArithmeticCompute + : public KernelLite { + public: + using param_t = operators::SequenceArithmeticParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto x = param.X; + auto y = param.Y; + auto out = param.Out; + int op_type = param.op_type; + + out->Resize(x->dims()); + out->set_lod(x->lod()); + + auto x_data = x->data(); + auto y_data = y->data(); + auto out_data = out->mutable_data(); + auto x_seq_offset = x->lod()[0]; + auto y_seq_offset = y->lod()[0]; + int seq_num = x_seq_offset.size() - 1; + int inner_size = (x->numel()) / (x->dims()[0]); + + // sum + if (op_type == 1) { + for (int i = 0; i < seq_num; i++) { + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = std::min(len_x, len_y); + for (int j = 0; j < len; j++) { + t_out[j] = input_x[j] + input_y[j]; + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(T) * (len_x - len)); + } + } + } + + // sub + if (op_type == 2) { + for (int i = 0; i < seq_num; i++) { + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = std::min(len_x, len_y); + for (int j = 0; j < len; j++) { + t_out[j] = input_x[j] - input_y[j]; + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(T) * (len_x - len)); + } + } + } + + // mul + if (op_type == 3) { + for (int i = 0; i < seq_num; i++) { + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = std::min(len_x, len_y); + for (int j = 0; j < len; j++) { + t_out[j] = input_x[j] * input_y[j]; + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(T) * (len_x - len)); + } + } + } + } + + virtual ~SequenceArithmeticCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/sequence_arithmetic_compute_test.cc b/lite/kernels/x86/sequence_arithmetic_compute_test.cc new file mode 100644 index 0000000000..3b41e7d7ce --- /dev/null +++ b/lite/kernels/x86/sequence_arithmetic_compute_test.cc @@ -0,0 +1,125 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
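As the branches above show, op_type selects the elementwise operation (1 add, 2 sub, 3 mul), applied per sequence over the overlapping prefix, with any trailing X elements copied through unchanged. A compact standalone sketch of that per-segment rule (plain C++; illustrative only):

#include <algorithm>
#include <cstddef>

// Applies op_type over the common prefix of one x/y segment; the tail of x
// (if longer) is passed through, as in SequenceArithmeticCompute.
void SegmentArithmetic(const float* x, size_t len_x, const float* y,
                       size_t len_y, int op_type, float* out) {
  size_t len = std::min(len_x, len_y);
  for (size_t j = 0; j < len; ++j) {
    out[j] = op_type == 1 ? x[j] + y[j]
           : op_type == 2 ? x[j] - y[j]
                          : x[j] * y[j];
  }
  std::copy(x + len, x + len_x, out + len);  // trailing x elements, if any
}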
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/sequence_arithmetic_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +void sequence_arithmetic_compute_ref(const Tensor& x, + const Tensor& y, + Tensor* out, + int op_type) { + auto x_data = x.data(); + auto y_data = y.data(); + out->Resize(x.dims()); + out->set_lod(x.lod()); + auto out_data = out->mutable_data(); + auto x_seq_offset = x.lod()[0]; + auto y_seq_offset = y.lod()[0]; + int seq_num = x_seq_offset.size() - 1; + int inner_size = x.numel() / x.dims()[0]; + + for (int i = 0; i < seq_num; i++) { + int len_x = (x_seq_offset[i + 1] - x_seq_offset[i]) * inner_size; + int len_y = (y_seq_offset[i + 1] - y_seq_offset[i]) * inner_size; + auto input_x = x_data + x_seq_offset[i] * inner_size; + auto input_y = y_data + y_seq_offset[i] * inner_size; + auto t_out = out_data + x_seq_offset[i] * inner_size; + int len = std::min(len_x, len_y); + for (int j = 0; j < len; j++) { + switch (op_type) { + case 1: + t_out[j] = input_x[j] + input_y[j]; + break; + case 2: + t_out[j] = input_x[j] - input_y[j]; + break; + case 3: + t_out[j] = input_x[j] * input_y[j]; + break; + default: + break; + } + } + if (len_x > len) { + memcpy(t_out + len, input_x + len, sizeof(float) * (len_x - len)); + } + } +} + +void prepare_input(Tensor* x, const LoD& x_lod) { + x->Resize({static_cast(x_lod[0].back()), 3}); + x->set_lod(x_lod); + auto x_data = x->mutable_data(); + for (int i = 0; i < x->numel(); i++) { + x_data[i] = (i - x->numel() / 2) * 1.1; + } +} + +TEST(sequence_arithmetic_x86, retrive_op) { + auto sequence_arithmetic = + KernelRegistry::Global().Create( + "sequence_arithmetic"); + ASSERT_FALSE(sequence_arithmetic.empty()); + ASSERT_TRUE(sequence_arithmetic.front()); +} + +TEST(sequence_arithmetic_x86, init) { + SequenceArithmeticCompute sequence_arithmetic; + ASSERT_EQ(sequence_arithmetic.precision(), PRECISION(kFloat)); + ASSERT_EQ(sequence_arithmetic.target(), TARGET(kX86)); +} + +TEST(sequence_arithmetic_x86, run_test) { + SequenceArithmeticCompute sequence_arithmetic; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + + lite::Tensor x, y, out, out_ref; + lite::LoD x_lod{{0, 2, 5, 9}}, y_lod{{0, 2, 5, 9}}; + prepare_input(&x, x_lod); + prepare_input(&y, y_lod); + + operators::SequenceArithmeticParam param; + param.X = &x; + param.Y = &y; + param.Out = &out; + param.op_type = 1; + + sequence_arithmetic.SetContext(std::move(ctx)); + sequence_arithmetic.SetParam(param); + sequence_arithmetic.Run(); + + sequence_arithmetic_compute_ref(x, y, &out_ref, param.op_type); + auto out_data = out.data(); + auto out_ref_data = out_ref.data(); + for (int i = 0; i < out.numel(); i++) { + EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-3); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(sequence_arithmetic, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/sequence_concat_compute.cc b/lite/kernels/x86/sequence_concat_compute.cc new file mode 100644 index 0000000000..facdad39d3 --- /dev/null +++ b/lite/kernels/x86/sequence_concat_compute.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/sequence_concat_compute.h" + +REGISTER_LITE_KERNEL(sequence_concat, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SequenceConcatCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/sequence_concat_compute.h b/lite/kernels/x86/sequence_concat_compute.h new file mode 100644 index 0000000000..553e2e8b06 --- /dev/null +++ b/lite/kernels/x86/sequence_concat_compute.h @@ -0,0 +1,84 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +inline LoD ConcatLoD(const std::vector& xs, + std::vector* xs_in_order) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + if (x_lod[i - 1] < x_lod[i]) { + xs_in_order->emplace_back(xs[j]->Slice(x_lod[i - 1], x_lod[i])); + } + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +template +class SequenceConcatCompute + : public KernelLite { + public: + using param_t = operators::SequenceConcatParam; + + void Run() override { + auto& param = *param_.get_mutable(); + // auto& param = Param(); + T* dout = param.Out->mutable_data(); + + std::vector x_in_order; + param.Out->set_lod(ConcatLoD(param.X, &x_in_order)); + + int num = x_in_order.size(); + int out_rows = 1; + + std::vector input_cols(num); + for (int i = 0; i < num; ++i) { + input_cols[i] = x_in_order[i].numel() / out_rows; + } + + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + auto input_data = x_in_order[j].data(); + memcpy(dout + col_idx, input_data, sizeof(T) * col_len); + col_idx += col_len; + } + } + + virtual ~SequenceConcatCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/sequence_concat_compute_test.cc b/lite/kernels/x86/sequence_concat_compute_test.cc new file mode 100644 index 0000000000..be1f86a5c8 --- /dev/null +++ b/lite/kernels/x86/sequence_concat_compute_test.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
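sequence_concat interleaves inputs by sequence: the i-th output sequence is x1's i-th sequence followed by x2's, then x3's, and ConcatLoD above accumulates the per-input offsets accordingly. A standalone sketch of that offset arithmetic (plain C++; illustrative only):

#include <cstdio>
#include <vector>

// Sums the level-0 LoD offsets of several inputs, mirroring ConcatLoD.
std::vector<size_t> ConcatOffsets(
    const std::vector<std::vector<size_t>>& lods) {
  std::vector<size_t> result(lods[0].size(), 0);
  for (size_t i = 1; i < result.size(); ++i) {
    size_t sum = 0;
    for (const auto& lod : lods) sum += lod[i];
    result[i] = sum;
  }
  return result;
}

int main() {
  // lods {0,3,5}, {0,1,2}, {0,2,4} -> {0, 6, 11}: output sequence 0 has
  // 3 + 1 + 2 rows, and sequence 1 ends at 5 + 2 + 4 = 11.
  for (size_t v : ConcatOffsets({{0, 3, 5}, {0, 1, 2}, {0, 2, 4}}))
    std::printf("%zu ", v);
  return 0;
}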
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/sequence_concat_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +namespace { +inline LoD ConcatLoD(const std::vector& xs, + std::vector* xs_in_order) { + std::vector result; + result.resize(xs[0]->lod()[0].size()); + + for (size_t i = 1; i < result.size(); ++i) { + size_t sum = 0; + for (size_t j = 0; j < xs.size(); ++j) { + auto& x_lod = xs[j]->lod()[0]; + if (x_lod[i - 1] < x_lod[i]) { + xs_in_order->emplace_back(xs[j]->Slice(x_lod[i - 1], x_lod[i])); + } + sum += x_lod[i]; + } + result[i] = sum; + } + LoD lod; + lod.emplace_back(result); + return lod; +} + +static void sequence_concat_ref(const std::vector& xs, + lite::Tensor* out) { + std::vector out_dims; + int64_t batch_size = 0; + int64_t feature_size = 0; + for (const auto& tensor : xs) { + const auto x_dims = tensor->dims(); + if (out_dims.empty()) { + out_dims = x_dims.Vectorize(); + } + batch_size += x_dims[0]; + if (feature_size == 0) { + feature_size = x_dims.production() / x_dims[0]; + } else { + CHECK_EQ(feature_size, x_dims.production() / x_dims[0]) + << "Inputs of sequence concat must have same feature size"; + } + } + out_dims[0] = batch_size; + out->Resize(out_dims); + std::vector x_in_order; + out->set_lod(ConcatLoD(xs, &x_in_order)); + + int num = x_in_order.size(); + std::vector input_cols(num); + for (int i = 0; i < num; ++i) { + input_cols[i] = x_in_order[i].numel(); + } + float* out_data = out->mutable_data(); + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + auto input_data = x_in_order[j].data(); + memcpy(out_data + col_idx, input_data, sizeof(float) * col_len); + col_idx += col_len; + } +} + +#define PREPARE_INPUT(name) \ + name.Resize({name##_lod_len, feature_len}); \ + name.set_lod(lod_info_##name); \ + float* name##_data = name.mutable_data(); \ + for (int i = 0; i < name.numel(); ++i) { \ + name##_data[i] = (i - 2.0) * 1.0; \ + } + +} // namespace + +TEST(sequence_concat_x86, retrive_op) { + auto sequence_concat = + KernelRegistry::Global().Create( + "sequence_concat"); + ASSERT_FALSE(sequence_concat.empty()); + ASSERT_TRUE(sequence_concat.front()); +} + +TEST(sequence_concat_x86, init) { + SequenceConcatCompute sequence_concat; + ASSERT_EQ(sequence_concat.precision(), PRECISION(kFloat)); + ASSERT_EQ(sequence_concat.target(), TARGET(kX86)); +} + +TEST(sequence_concat_x86, run_test) { + SequenceConcatCompute seq_kernel; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + + operators::SequenceConcatParam param; + lite::Tensor x1, x2, x3; + lite::Tensor y, y_ref; + + int32_t x1_lod_len = 10, feature_len = 4; + int32_t x2_lod_len = 4, x3_lod_len = 8; + int32_t y_lod_len = x1_lod_len + x2_lod_len + x3_lod_len; + LoD lod_info_x1{{0, 3, 5, 6, 10}}; + LoD lod_info_x2{{0, 1, 2, 3, 4}}; + LoD lod_info_x3{{0, 2, 4, 6, 8}}; + LoD lod_info_y{{0, 0, 0, 0, 0}}; + for (size_t i = 0; i 
< lod_info_x1[0].size(); ++i) { + lod_info_y[0][i] = + lod_info_x1[0][i] + lod_info_x2[0][i] + lod_info_x3[0][i]; + } + + PREPARE_INPUT(x1); + PREPARE_INPUT(x2); + PREPARE_INPUT(x3); + + y_ref.Resize({y_lod_len, feature_len}); + y.Resize({y_lod_len, feature_len}); + y_ref.set_lod(lod_info_y); + y.set_lod(lod_info_y); + + std::vector xs{&x1, &x2, &x3}; + + param.X = xs; + param.Out = &y; + seq_kernel.SetParam(param); + + seq_kernel.SetContext(std::move(ctx)); + seq_kernel.Run(); + + auto* y_data = y.mutable_data(); + sequence_concat_ref(xs, &y_ref); + float* y_ref_data = y_ref.mutable_data(); + + for (int i = 0; i < y.numel(); i++) { + EXPECT_NEAR(y_data[i], y_ref_data[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(sequence_concat, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/sequence_reverse_compute.cc b/lite/kernels/x86/sequence_reverse_compute.cc new file mode 100644 index 0000000000..6c391e12ad --- /dev/null +++ b/lite/kernels/x86/sequence_reverse_compute.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/sequence_reverse_compute.h" + +typedef paddle::lite::kernels::x86::SequenceReverseCompute + ReverseFp32; +typedef paddle::lite::kernels::x86::SequenceReverseCompute + ReverseInt64; + +REGISTER_LITE_KERNEL(sequence_reverse, kX86, kFloat, kNCHW, ReverseFp32, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); + +REGISTER_LITE_KERNEL(sequence_reverse, kX86, kInt64, kNCHW, ReverseInt64, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) + .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))}) + .Finalize(); diff --git a/lite/kernels/x86/sequence_reverse_compute.h b/lite/kernels/x86/sequence_reverse_compute.h new file mode 100644 index 0000000000..ab93972276 --- /dev/null +++ b/lite/kernels/x86/sequence_reverse_compute.h @@ -0,0 +1,63 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +#include +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SequenceReverseCompute : public KernelLite { + public: + using param_t = operators::SequenceReverseParam; + + void Run() override { + auto& param = this->template Param(); + auto* output = param.Out; + const auto* din = param.X->template data(); + + T* dout = output->template mutable_data(); + CHECK_NE(din, dout) + << "SequenceReverse Op does not support in-place operation"; + const auto lod = param.X->lod()[param.X->lod().size() - 1]; + const size_t lod_count = lod.size(); + + size_t limit = static_cast(param.X->numel()); + size_t row_numel = static_cast(limit / param.X->dims()[0]); + + for (size_t idx = 0; idx < lod_count - 1; ++idx) { + auto start_pos = lod[idx]; + auto end_pos = lod[idx + 1]; + for (auto pos = start_pos; pos < end_pos; ++pos) { + auto cur_pos = end_pos - pos - 1 + start_pos; + std::memcpy(dout + pos * row_numel, + din + cur_pos * row_numel, + row_numel * sizeof(T)); + } + } + output->set_lod(param.X->lod()); + } + + virtual ~SequenceReverseCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/sequence_reverse_compute_test.cc b/lite/kernels/x86/sequence_reverse_compute_test.cc new file mode 100644 index 0000000000..4b84241c8b --- /dev/null +++ b/lite/kernels/x86/sequence_reverse_compute_test.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
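The kernel above reverses rows within each sequence of the last LoD level while leaving the LoD itself unchanged. A standalone sketch of the index mapping it applies (plain C++; illustrative only):

#include <cstdio>
#include <vector>

// For each row position, returns the source row it is copied from when each
// sequence is reversed, as in SequenceReverseCompute::Run().
std::vector<size_t> ReverseMap(const std::vector<size_t>& lod) {
  std::vector<size_t> map(lod.back());
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    for (size_t pos = lod[i]; pos < lod[i + 1]; ++pos) {
      map[pos] = lod[i + 1] - pos - 1 + lod[i];
    }
  }
  return map;
}

int main() {
  // lod {0, 3, 5}: rows 0,1,2 come from 2,1,0 and rows 3,4 from 4,3.
  for (size_t v : ReverseMap({0, 3, 5})) std::printf("%zu ", v);
  return 0;
}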
+ +#include "lite/kernels/x86/sequence_reverse_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +namespace { +static void sequence_reverse_ref(const lite::Tensor* x, lite::Tensor* y) { + const auto* x_data = x->data(); + auto seq_offset = x->lod()[x->lod().size() - 1]; + int width = x->numel() / x->dims()[0]; + auto* y_data = y->mutable_data(); + for (int i = 0; i < seq_offset.size() - 1; ++i) { + auto start_pos = seq_offset[i]; + auto end_pos = seq_offset[i + 1]; + for (auto pos = start_pos; pos < end_pos; ++pos) { + auto cur_pos = end_pos - pos - 1 + start_pos; + std::memcpy(y_data + pos * width, + x_data + cur_pos * width, + width * sizeof(float)); + } + } +} +} // namespace + +TEST(sequence_reverse_x86, retrive_op) { + auto sequence_reverse = + KernelRegistry::Global().Create( + "sequence_reverse"); + ASSERT_FALSE(sequence_reverse.empty()); + ASSERT_TRUE(sequence_reverse.front()); +} + +TEST(sequence_reverse_x86, init) { + SequenceReverseCompute sequence_reverse; + ASSERT_EQ(sequence_reverse.precision(), PRECISION(kFloat)); + ASSERT_EQ(sequence_reverse.target(), TARGET(kX86)); +} + +TEST(sequence_reverse_x86, run_test) { + SequenceReverseCompute seq_kernel; + std::unique_ptr ctx(new KernelContext); + + operators::SequenceReverseParam param; + lite::Tensor x, x_ref; + lite::Tensor y, y_ref; + + int32_t lod_len = 10, feature_len = 4; + LoD lod_info{{0, 2, 4}, {0, 3, 5, 6, 10}}; + + x.Resize({lod_len, feature_len}); + x_ref.Resize({lod_len, feature_len}); + y.Resize({lod_len, feature_len}); + y_ref.Resize({lod_len, feature_len}); + x.set_lod(lod_info); + x_ref.set_lod(lod_info); + y.set_lod(lod_info); + y_ref.set_lod(lod_info); + + auto* y_data = y.mutable_data(); + float* x_data = x.mutable_data(); + float* x_ref_data = x_ref.mutable_data(); + float* y_ref_data = y_ref.mutable_data(); + + for (int i = 0; i < x.numel(); ++i) { + x_ref_data[i] = (i - 2.0) * 1.0; + x_data[i] = (i - 2.0) * 1.0; + } + + param.X = &x; + param.Out = &y; + seq_kernel.SetParam(param); + + seq_kernel.SetContext(std::move(ctx)); + seq_kernel.Run(); + + sequence_reverse_ref(&x_ref, &y_ref); + for (int i = 0; i < y.numel(); i++) { + EXPECT_NEAR(y_data[i], y_ref_data[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(sequence_reverse, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/sequence_topk_avg_pooling_compute.cc b/lite/kernels/x86/sequence_topk_avg_pooling_compute.cc new file mode 100644 index 0000000000..9bd8b28750 --- /dev/null +++ b/lite/kernels/x86/sequence_topk_avg_pooling_compute.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/x86/sequence_topk_avg_pooling_compute.h" + +REGISTER_LITE_KERNEL( + sequence_topk_avg_pooling, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SequenceTopkAvgPoolingCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("ROW", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("COLUMN", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("pos", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/sequence_topk_avg_pooling_compute.h b/lite/kernels/x86/sequence_topk_avg_pooling_compute.h new file mode 100644 index 0000000000..724415288a --- /dev/null +++ b/lite/kernels/x86/sequence_topk_avg_pooling_compute.h @@ -0,0 +1,50 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "lite/backends/x86/math/sequence_topk_avg_pooling.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/types.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class SequenceTopkAvgPoolingCompute + : public KernelLite { + public: + using param_t = operators::SequenceTopkAvgPoolingParam; + + void Run() override { + auto& param = *param_.get_mutable(); + lite::x86::math::SequenceTopkAvgPoolingFunctor + sequence_topk_avg_pooling; + sequence_topk_avg_pooling(*param.X, + *param.ROW, + *param.COLUMN, + param.Out, + param.pos, + param.channel_num, + param.topks); + }; + virtual ~SequenceTopkAvgPoolingCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/softmax_compute.cc b/lite/kernels/x86/softmax_compute.cc index a00aa6d566..3a2cdc29ed 100644 --- a/lite/kernels/x86/softmax_compute.cc +++ b/lite/kernels/x86/softmax_compute.cc @@ -23,3 +23,13 @@ REGISTER_LITE_KERNEL(softmax, .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) .Finalize(); +REGISTER_LITE_KERNEL(search_seq_softmax, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::SoftmaxCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out_log", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/stack_compute.cc b/lite/kernels/x86/stack_compute.cc new file mode 100644 index 0000000000..5f69319a6c --- /dev/null +++ b/lite/kernels/x86/stack_compute.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/stack_compute.h" + +REGISTER_LITE_KERNEL(stack, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::StackCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/stack_compute.h b/lite/kernels/x86/stack_compute.h new file mode 100644 index 0000000000..12a6c3490e --- /dev/null +++ b/lite/kernels/x86/stack_compute.h @@ -0,0 +1,72 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "lite/core/kernel.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" +#include "lite/core/type_system.h" +#include "lite/operators/stack_op.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class StackCompute : public KernelLite { + public: + using param_t = operators::StackParam; + + void Run() override { + auto& param = *param_.get_mutable(); + auto x = param.X; + auto y = param.Out; + + int axis = param.axis; + if (axis < 0) axis += (x[0]->dims().size() + 1); + + int n = static_cast(x.size()); + auto y_data = y->mutable_data(); + std::vector x_datas(n); + for (int i = 0; i < n; ++i) x_datas[i] = x[i]->data(); + + int pre = 1, post = 1; + auto dim = x[0]->dims(); + for (int i = 0; i < axis; ++i) pre *= dim[i]; + for (int i = axis; i < dim.size(); ++i) post *= dim[i]; + + auto x_data_arr = x_datas.data(); + + size_t x_offset = 0; + size_t y_offset = 0; + for (int i = 0; i < pre; i++) { + for (int j = 0; j < n; j++) { + std::memcpy( + y_data + y_offset, x_data_arr[j] + x_offset, post * sizeof(T)); + y_offset += post; + } + x_offset += post; + } + } + + virtual ~StackCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/stack_compute_test.cc b/lite/kernels/x86/stack_compute_test.cc new file mode 100644 index 0000000000..d105165a98 --- /dev/null +++ b/lite/kernels/x86/stack_compute_test.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/stack_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +// stack +TEST(stack_x86, retrive_op) { + auto stack = + KernelRegistry::Global().Create("stack"); + ASSERT_FALSE(stack.empty()); + ASSERT_TRUE(stack.front()); +} + +TEST(stack_x86, init) { + lite::kernels::x86::StackCompute stack; + ASSERT_EQ(stack.precision(), PRECISION(kFloat)); + ASSERT_EQ(stack.target(), TARGET(kX86)); +} + +TEST(stack_x86, run_test) { + lite::Tensor x; + lite::Tensor out; + int num_input = 5; + + std::vector x_shape({10, 20, 10}); + x.Resize(lite::DDim(x_shape)); + + std::vector out_shape({5, 10, 20, 10}); + out.Resize(lite::DDim(out_shape)); + + auto x_data = x.mutable_data(); + auto out_data = out.mutable_data(); + + for (int64_t i = 0; i < x.dims().production(); ++i) { + x_data[i] = static_cast(i); + } + std::vector input; + for (int i = 0; i < num_input; ++i) { + input.emplace_back(&x); + } + + // StackCompute stack; + StackCompute stack; + operators::StackParam param; + + param.X = input; + param.Out = &out; + int axis = 0; + param.axis = axis; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + stack.SetContext(std::move(ctx)); + stack.SetParam(param); + stack.Run(); + + int ref_data = 0; + for (int j = 0; j < out.dims().production(); ++j) { + EXPECT_NEAR(out_data[j], ref_data, 1e-5); + ref_data++; + ref_data = (ref_data >= 2000) ? (ref_data - 2000) : ref_data; + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(stack, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/var_conv_2d_compute.cc b/lite/kernels/x86/var_conv_2d_compute.cc new file mode 100644 index 0000000000..48ae1b055e --- /dev/null +++ b/lite/kernels/x86/var_conv_2d_compute.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
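StackCompute above stacks n equally shaped inputs along axis by splitting each input into pre outer blocks (product of the dims before the axis) of post contiguous elements (product of the remaining dims) and interleaving one post-sized chunk from every input per outer block. A hedged standalone sketch of that copy loop, assuming raw float buffers (the function name is illustrative):

#include <cstring>
#include <vector>

// Stack `xs.size()` equally shaped inputs along an axis characterised by
// `pre` (product of dims before the axis) and `post` (product of the rest).
void stack_sketch(const std::vector<const float*>& xs, float* y,
                  int pre, int post) {
  const int n = static_cast<int>(xs.size());
  size_t x_offset = 0;
  size_t y_offset = 0;
  for (int i = 0; i < pre; ++i) {
    for (int j = 0; j < n; ++j) {
      std::memcpy(y + y_offset, xs[j] + x_offset, post * sizeof(float));
      y_offset += post;
    }
    x_offset += post;
  }
}

With axis = 0, as in the test above, pre = 1 and post = 10 * 20 * 10 = 2000, so the five identical inputs are simply laid out back to back, which is why the reference value in the test wraps around every 2000 elements.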
+ +#include "lite/kernels/x86/var_conv_2d_compute.h" + +REGISTER_LITE_KERNEL(var_conv_2d, + kX86, + kFloat, + kNCHW, + paddle::lite::kernels::x86::VarConv2DCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindInput("W", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) + .BindOutput("Col", {LiteType::GetTensorTy(TARGET(kX86))}) + .Finalize(); diff --git a/lite/kernels/x86/var_conv_2d_compute.h b/lite/kernels/x86/var_conv_2d_compute.h new file mode 100644 index 0000000000..c94cb2ca2d --- /dev/null +++ b/lite/kernels/x86/var_conv_2d_compute.h @@ -0,0 +1,213 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include "lite/backends/x86/math/blas.h" +#include "lite/core/kernel.h" +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +template +class VarConv2DCompute : public KernelLite { + public: + using param_t = operators::VarConv2DParam; + + void Im2Col(const lite::Tensor& input, lite::Tensor* col) const { + auto& param = *param_.get_mutable(); + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + // auto* in_row = param.ROW; + // auto* in_col = param.COLUMN; + + int batch = input.lod()[0].size() - 1; + const auto& bottom_offset = input.lod()[0]; + // 2-D lod info. 
+ // const auto& offset_x = in_col->lod()[0]; + // const auto& offset_y = in_row->lod()[0]; + const auto& offset_y = param.X->lod()[1]; + const auto& offset_x = param.X->lod()[2]; + + // top offset is the whole size of each data sample + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_x = top_im_x * top_im_y; + int top_y = input_channel * kernel_h * kernel_w; + top_size += top_y * top_x; + top_offset.push_back(top_size); + } + // std::vector col_lod_vec; + // col_lod_vec.push_back(top_offset); + LoD col_lod; + col_lod.push_back(top_offset); + col->set_lod(col_lod); + std::vector col_dims_vec{top_size}; + col_dims_vec.push_back(1); + col->Resize(col_dims_vec); + auto* top_data = col->mutable_data(); + const auto* bottom_data = input.data(); + + int kernel_win_size = kernel_h * kernel_w; + int half_kernel_h = kernel_h / 2; + int half_kernel_w = kernel_w / 2; + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int top_im_x = (width - 1) / stride_w + 1; + int top_im_y = (height - 1) / stride_h + 1; + int top_x = top_im_y * top_im_x; + for (int z = 0; z < input_channel; ++z) { + int row_offset = kernel_win_size * z; + int im_offset = z * width * height; + for (int y = 0; y < height; y += stride_h) { + for (int x = 0; x < width; x += stride_w) { + int col_offset = x / stride_w + y / stride_h * top_im_x; + for (int ky = 0; ky < kernel_h; ++ky) { + for (int kx = 0; kx < kernel_w; ++kx) { + int im_y = y + ky - half_kernel_h; + int im_x = x + kx - half_kernel_w; + if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) { + top_data[t_offset + + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = + bottom_data[b_offset + im_offset + im_y * width + im_x]; + } else { + top_data[t_offset + + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = 0; + } + } + } + } + } + } + } + } + + void Run() override { + auto& param = *param_.get_mutable(); + auto& context = ctx_->As(); + auto* bottom = param.X; + // auto* in_row = param.ROW; + // auto* in_col = param.COLUMN; + auto* w = param.W; + auto* top = param.Out; + auto* col = param.Col; + + int output_channel = param.output_channel; + int input_channel = param.input_channel; + int kernel_h = param.kernel_h; + int kernel_w = param.kernel_w; + int stride_h = param.stride_h; + int stride_w = param.stride_w; + + Im2Col(*bottom, col); + int batch = bottom->lod()[0].size() - 1; + const auto& col_offset = col->lod()[0]; + // const auto& offset_x = in_col->lod()[0]; + // const auto& offset_y = in_row->lod()[0]; + const auto& offset_y = param.X->lod()[1]; + const auto& offset_x = param.X->lod()[2]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } 
else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top->set_lod(top_lod); + std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + auto* top_data = top->mutable_data(); + const auto* w_data = w->data(); + const auto* col_data = col->data(); + + auto blas = lite::x86::math::GetBlas(context); + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } + + blas.GEMM(false, + false, + output_channel, + top_im_size, + input_channel * kernel_h * kernel_w, + 1.0, + w_data, + input_channel * kernel_h * kernel_w, + col_data + col_offset[b], + top_im_size, + 0.0, + top_data + top_offset[b], + top_im_size); + } + } + + virtual ~VarConv2DCompute() = default; +}; + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/x86/var_conv_2d_compute_test.cc b/lite/kernels/x86/var_conv_2d_compute_test.cc new file mode 100644 index 0000000000..d6ae5a67bf --- /dev/null +++ b/lite/kernels/x86/var_conv_2d_compute_test.cc @@ -0,0 +1,315 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/x86/var_conv_2d_compute.h" +#include +#include +#include +#include +#include "lite/core/op_registry.h" +#include "lite/core/tensor.h" +namespace paddle { +namespace lite { +namespace kernels { +namespace x86 { + +static void im2col_ref(const lite::Tensor& input, + const lite::Tensor* in_row, + const lite::Tensor* in_col, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int input_channel, + lite::Tensor* col) { + int batch = input.lod()[0].size() - 1; + const auto& bottom_offset = input.lod()[0]; + // 2-D lod info. 
+ const auto& offset_x = in_col->lod()[0]; + const auto& offset_y = in_row->lod()[0]; + + // top offset is the whole size of each data sample + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_x = top_im_x * top_im_y; + int top_y = input_channel * kernel_h * kernel_w; + top_size += top_y * top_x; + top_offset.push_back(top_size); + } + LoD col_lod; + col_lod.push_back(top_offset); + col->set_lod(col_lod); + std::vector col_dims_vec{top_size}; + col_dims_vec.push_back(1); + col->Resize(col_dims_vec); + auto* top_data = col->mutable_data(); + const auto* bottom_data = input.data(); + + int kernel_win_size = kernel_h * kernel_w; + int half_kernel_h = kernel_h / 2; + int half_kernel_w = kernel_w / 2; + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int top_im_x = (width - 1) / stride_w + 1; + int top_im_y = (height - 1) / stride_h + 1; + int top_x = top_im_y * top_im_x; + for (int z = 0; z < input_channel; ++z) { + int row_offset = kernel_win_size * z; + int im_offset = z * width * height; + for (int y = 0; y < height; y += stride_h) { + for (int x = 0; x < width; x += stride_w) { + int col_offset = x / stride_w + y / stride_h * top_im_x; + for (int ky = 0; ky < kernel_h; ++ky) { + for (int kx = 0; kx < kernel_w; ++kx) { + int im_y = y + ky - half_kernel_h; + int im_x = x + kx - half_kernel_w; + if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) { + top_data[t_offset + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = + bottom_data[b_offset + im_offset + im_y * width + im_x]; + } else { + top_data[t_offset + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = 0; + } + } + } + } + } + } + } +} + +static void var_conv_2d_ref(const lite::Tensor* bottom, + const lite::Tensor* w, + const lite::Tensor* in_row, + const lite::Tensor* in_col, + const int kernel_h, + const int kernel_w, + const int stride_h, + const int stride_w, + const int input_channel, + const int output_channel, + lite::Tensor* top, + lite::Tensor* col) { + std::unique_ptr ctx(new KernelContext); + auto& context = ctx->As(); + + im2col_ref(*bottom, + in_row, + in_col, + kernel_h, + kernel_w, + stride_h, + stride_w, + input_channel, + col); + int batch = bottom->lod()[0].size() - 1; + const auto& col_offset = col->lod()[0]; + const auto& offset_x = in_col->lod()[0]; + const auto& offset_y = in_row->lod()[0]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + top_offset.push_back(top_size); + } + + LoD top_lod; + top_lod.push_back(top_offset); + top->set_lod(top_lod); 
+ std::vector top_dims_vec{top_size}; + top_dims_vec.push_back(1); + top->Resize(top_dims_vec); + auto* top_data = top->mutable_data(); + const auto* w_data = w->data(); + const auto* col_data = col->data(); + + auto blas = lite::x86::math::GetBlas(context); + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } + + blas.GEMM(false, + false, + output_channel, + top_im_size, + input_channel * kernel_h * kernel_w, + 1.0, + w_data, + input_channel * kernel_h * kernel_w, + col_data + col_offset[b], + top_im_size, + 0.0, + top_data + top_offset[b], + top_im_size); + } +} + +TEST(var_conv_2d_x86, retrive_op) { + auto var_conv_2d = + KernelRegistry::Global().Create( + "var_conv_2d"); + ASSERT_FALSE(var_conv_2d.empty()); + ASSERT_TRUE(var_conv_2d.front()); +} + +TEST(var_conv_2d_x86, init) { + VarConv2DCompute var_conv_2d; + ASSERT_EQ(var_conv_2d.precision(), PRECISION(kFloat)); + ASSERT_EQ(var_conv_2d.target(), TARGET(kX86)); +} + +TEST(var_conv_2d_x86, run_test) { + VarConv2DCompute var_conv_2d; + std::unique_ptr ctx(new KernelContext); + ctx->As(); + + operators::VarConv2DParam param; + + lite::Tensor X, W, ROW, COLUMN; + lite::Tensor Out, Col; + int kernel_h, kernel_w; + int stride_h, stride_w; + int input_channel, output_channel; + + output_channel = 5; + input_channel = 5; + kernel_h = 5; + kernel_w = 5; + stride_h = 1; + stride_w = 1; + std::vector w_dims_vec; + w_dims_vec.push_back(output_channel); + w_dims_vec.push_back(input_channel * kernel_h * kernel_w); + W.Resize(w_dims_vec); + auto* w_data = W.mutable_data(); + for (int i = 0; i < W.numel(); ++i) { + w_data[i] = i - 1.f; + } + + std::vector row_lod_vec{0, 10, 20}; + LoD row_lod; + row_lod.push_back(row_lod_vec); + ROW.set_lod(row_lod); + + std::vector column_lod_vec{0, 10, 20}; + LoD column_lod; + column_lod.push_back(column_lod_vec); + COLUMN.set_lod(column_lod); + + int x_size = 0; + std::vector x_lod_vec; + x_lod_vec.push_back(0); + for (size_t i = 0; i < row_lod_vec.size() - 1; ++i) { + int height = row_lod_vec[i + 1] - row_lod_vec[i]; + int width = column_lod_vec[i + 1] - column_lod_vec[i]; + x_lod_vec.push_back(height * width * input_channel); + x_size += height * width * input_channel; + } + std::vector x_dims_vec{x_size, 1}; + LoD x_lod; + x_lod.push_back(x_lod_vec); + x_lod.push_back(row_lod_vec); + x_lod.push_back(column_lod_vec); + X.Resize(x_dims_vec); + X.set_lod(x_lod); + auto* x_data = X.mutable_data(); + for (int i = 0; i < X.numel(); ++i) { + x_data[i] = i % 20 * 1.f; + } + + param.X = &X; + param.W = &W; + // param.ROW = &ROW; + // param.COLUMN = &COLUMN; + param.Out = &Out; + param.Col = &Col; + param.stride_h = stride_h; + param.stride_w = stride_w; + param.kernel_h = kernel_h; + param.kernel_w = kernel_w; + param.input_channel = input_channel; + param.output_channel = output_channel; + var_conv_2d.SetParam(param); + var_conv_2d.SetContext(std::move(ctx)); + var_conv_2d.Run(); + + lite::Tensor top_ref, col_ref; + var_conv_2d_ref(&X, + &W, + &ROW, + &COLUMN, + kernel_h, + kernel_w, + stride_h, + stride_w, + input_channel, + output_channel, + &top_ref, + &col_ref); + + for (int i = 0; i < Out.numel(); ++i) { + EXPECT_NEAR(Out.data()[i], top_ref.data()[i], 1e-5); + } + for (int i = 0; i < Col.numel(); ++i) { + EXPECT_NEAR(Col.data()[i], col_ref.data()[i], 1e-5); + } +} + +} // namespace x86 +} // namespace kernels +} // namespace lite +} // namespace paddle + +USE_LITE_KERNEL(var_conv_2d, kX86, kFloat, kNCHW, def); diff 
--git a/lite/kernels/xpu/bridges/conv_op.cc b/lite/kernels/xpu/bridges/conv_op.cc index 2c758cf950..d6fc806ad4 100644 --- a/lite/kernels/xpu/bridges/conv_op.cc +++ b/lite/kernels/xpu/bridges/conv_op.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "lite/operators/conv_op.h" #include "lite/backends/xpu/builder.h" #include "lite/kernels/xpu/bridges/registry.h" @@ -46,14 +47,36 @@ node_map_type ConvConverter(const std::shared_ptr op, auto groups = op_info->GetAttr("groups"); auto dilations = op_info->GetAttr>("dilations"); auto fuse_relu = op_info->GetAttr("fuse_relu"); - CHECK_EQ(strides.size(), 2); - CHECK_EQ(paddings.size(), 2); - CHECK_EQ(dilations.size(), 2); + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + + if (paddings.size() == 2L) { + for (size_t i = 0; i < strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "Paddings size should be the same or twice as the input size."; + + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + operators::UpdatePaddingAndDilation(&paddings, + &dilations, + strides, + padding_algorithm, + input_dims, + filter_dims); + std::vector output_shape({bs, oc}); for (size_t i = 0; i < 2; i++) { const int dkernel = dilations[i] * (filter_dims[2 + i] - 1) + 1; output_shape.push_back( - (input_dims[i + 2] + 2 * paddings[i] - dkernel) / strides[i] + 1); + (input_dims[i + 2] + paddings[2 * i] + paddings[2 * i + 1] - dkernel) / + strides[i] + + 1); } DDim output_dims(output_shape); diff --git a/lite/kernels/xpu/bridges/conv_op_test.cc b/lite/kernels/xpu/bridges/conv_op_test.cc index ebdb67bd0d..70929ffcd5 100644 --- a/lite/kernels/xpu/bridges/conv_op_test.cc +++ b/lite/kernels/xpu/bridges/conv_op_test.cc @@ -54,7 +54,7 @@ void conv_ref(const std::shared_ptr op) { int stride_h = strides[0]; int dila_w = dilations[1]; int dila_h = dilations[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; int pad_h = paddings[0]; int batch_size = input_dims[0]; int in_ch_size = input_dims[1]; @@ -175,7 +175,8 @@ void test_conv(int bs, opdesc.SetOutput("Output", {output_var_name}); opdesc.SetAttr("dilations", std::vector({dilation, dilation})); opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); opdesc.SetAttr("groups", groups); opdesc.SetAttr("fuse_relu", static_cast(fuse_relu)); if (has_bias) { diff --git a/lite/kernels/xpu/bridges/pool_op_test.cc b/lite/kernels/xpu/bridges/pool_op_test.cc index ed5f922d59..7efc6b464c 100644 --- a/lite/kernels/xpu/bridges/pool_op_test.cc +++ b/lite/kernels/xpu/bridges/pool_op_test.cc @@ -60,7 +60,7 @@ void pool_ref(const std::shared_ptr op) { int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; if (global_pooling == true) { for (int n = 0; n < in_n; ++n) { @@ -162,7 +162,8 @@ void test_pool(int bs, opdesc.SetAttr("global_pooling", global_pooling); opdesc.SetAttr("exclusive", exclusive); opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", std::vector({padding, padding})); + opdesc.SetAttr("paddings", + std::vector({padding, padding, padding, padding})); 
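The XPU conv bridge above now accepts both the 2-value and the 4-value paddings attribute: a [pad_h, pad_w] pair is expanded in place to [pad_h, pad_h, pad_w, pad_w] (top/bottom/left/right, which is why the updated tests read pad_w from paddings[2]), and the output extent then uses both sides of each padding. A hedged sketch of the two pieces, with illustrative helper names:

#include <vector>

// Expand a one-value-per-dimension padding to the (before, after) form.
inline void expand_paddings_2_to_4(std::vector<int>* paddings,
                                   size_t spatial_rank /* = strides.size() */) {
  if (paddings->size() != spatial_rank) return;  // already in 4-value form
  for (size_t i = 0; i < spatial_rank; ++i) {
    const int copy_pad = *(paddings->begin() + 2 * i);
    paddings->insert(paddings->begin() + 2 * i + 1, copy_pad);
  }
}

// Convolution output extent with possibly asymmetric padding.
inline int conv_out_size(int in, int filter, int dilation,
                         int pad_before, int pad_after, int stride) {
  const int dkernel = dilation * (filter - 1) + 1;
  return (in + pad_before + pad_after - dkernel) / stride + 1;
}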
opdesc.SetAttr("ceil_mode", ceil_mode); // create and convert op to XPU model, then run it on XPU diff --git a/lite/model_parser/model_parser.cc b/lite/model_parser/model_parser.cc index 13b6cb5b77..ed3f45c598 100644 --- a/lite/model_parser/model_parser.cc +++ b/lite/model_parser/model_parser.cc @@ -568,7 +568,7 @@ void SaveModelNaive(const std::string &model_dir, SaveParamNaive(path, exec_scope, var.Name()); } } - VLOG(4) << "Save naive buffer model in '" << model_dir << "'' successfully"; + LOG(INFO) << "Save naive buffer model in '" << model_dir << "' successfully"; } #endif diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt index 49badbb27b..7c4048c204 100644 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -2,11 +2,10 @@ set(op_DEPS tensor op op_params scope memory) lite_cc_library(op_params SRCS op_params.cc DEPS tensor any) +# 1.baisc ops used in basic models add_operator(conv_op basic SRCS conv_op.cc DEPS ${op_DEPS}) add_operator(pool_op basic SRCS pool_op.cc DEPS ${op_DEPS}) add_operator(fc_op basic SRCS fc_op.cc DEPS ${op_DEPS}) -add_operator(assign_op extra SRCS assign_op.cc DEPS ${op_DEPS}) -add_operator(relu_op basic SRCS relu_op.cc DEPS ${op_DEPS}) add_operator(mul_op basic SRCS mul_op.cc DEPS ${op_DEPS}) add_operator(matmul_op basic SRCS matmul_op.cc DEPS ${op_DEPS}) add_operator(scale_op basic SRCS scale_op.cc DEPS ${op_DEPS}) @@ -15,57 +14,64 @@ add_operator(reshape_op basic SRCS reshape_op.cc DEPS ${op_DEPS} ) add_operator(batch_norm_op basic SRCS batch_norm_op.cc DEPS ${op_DEPS}) add_operator(feed_op basic SRCS feed_op.cc DEPS ${op_DEPS}) add_operator(fetch_op basic SRCS fetch_op.cc DEPS ${op_DEPS}) -add_operator(io_copy_op basic SRCS io_copy_op.cc DEPS ${op_DEPS}) -add_operator(io_copy_once_op basic SRCS io_copy_once_op.cc DEPS io_copy_op ${op_DEPS}) add_operator(activation_ops basic SRCS activation_ops.cc DEPS ${op_DEPS}) add_operator(elementwise_ops basic SRCS elementwise_ops.cc DEPS ${op_DEPS}) -add_operator(lrn_op_lite basic SRCS lrn_op.cc DEPS ${op_DEPS}) -add_operator(decode_bboxes_op_lite basic SRCS decode_bboxes_op.cc DEPS ${op_DEPS}) add_operator(box_coder_op_lite basic SRCS box_coder_op.cc DEPS ${op_DEPS}) add_operator(multiclass_nms_op_lite basic SRCS multiclass_nms_op.cc DEPS ${op_DEPS}) -add_operator(fusion_elementwise_activation_ops basic SRCS fusion_elementwise_activation_ops.cc DEPS elementwise_ops ${op_DEPS}) add_operator(mean_op basic SRCS mean_op.cc DEPS ${op_DEPS}) add_operator(fill_constant_op basic SRCS fill_constant_op.cc DEPS ${op_DEPS}) -#add_operator(sgd_op basic SRCS sgd_op.cc DEPS ${op_DEPS}) -add_operator(uniform_random_op basic SRCS uniform_random_op.cc DEPS ${op_DEPS}) -add_operator(power_op basic SRCS power_op.cc DEPS ${op_DEPS}) add_operator(shuffle_channel_op basic SRCS shuffle_channel_op.cc DEPS ${op_DEPS}) add_operator(yolo_box_op basic SRCS yolo_box_op.cc DEPS ${op_DEPS}) add_operator(interpolate_op basic SRCS interpolate_op.cc DEPS ${op_DEPS}) add_operator(argmax_op basic SRCS argmax_op.cc DEPS ${op_DEPS}) -add_operator(axpy_op basic SRCS axpy_op.cc DEPS ${op_DEPS}) -add_operator(gru_unit_op basic SRCS gru_unit_op.cc DEPS ${op_DEPS}) -add_operator(gru_op basic SRCS gru_op.cc DEPS ${op_DEPS}) -add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS}) -add_operator(layout_once_op basic SRCS layout_once_op.cc DEPS ${op_DEPS}) add_operator(prior_box_op basic SRCS prior_box_op.cc DEPS ${op_DEPS}) -add_operator(density_prior_box_op basic SRCS density_prior_box_op.cc DEPS ${op_DEPS}) 
-add_operator(dropout_op basic SRCS dropout_op.cc DEPS ${op_DEPS}) add_operator(concat_op basic SRCS concat_op.cc DEPS ${op_DEPS}) add_operator(pad2d_op basic SRCS pad2d_op.cc DEPS ${op_DEPS}) -add_operator(negative_op basic SRCS negative_op.cc DEPS ${op_DEPS}) -add_operator(crop_op basic SRCS crop_op.cc DEPS ${op_DEPS}) add_operator(calib_op basic SRCS calib_op.cc DEPS ${op_DEPS}) -add_operator(calib_once_op basic SRCS calib_once_op.cc DEPS ${op_DEPS}) add_operator(split_op basic SRCS split_op.cc DEPS ${op_DEPS}) add_operator(transpose_op basic SRCS transpose_op.cc DEPS ${op_DEPS}) add_operator(fake_quant basic SRCS fake_quantize_moving_avg_max_abs.cc DEPS ${op_DEPS}) add_operator(fake_dequant basic SRCS fake_dequantize_max_abs.cc DEPS ${op_DEPS}) add_operator(conv_transpose_op basic SRCS conv_transpose_op.cc DEPS ${op_DEPS}) -add_operator(graph_op basic SRCS graph_op.cc DEPS ${op_DEPS}) add_operator(expand_op_lite basic SRCS expand_op.cc DEPS ${op_DEPS}) -add_operator(reduce_max_op_lite basic SRCS reduce_max_op.cc DEPS ${op_DEPS}) -add_operator(norm_op basic SRCS norm_op.cc DEPS ${op_DEPS}) -add_operator(shape_op_lite basic SRCS shape_op.cc DEPS ${op_DEPS}) -add_operator(sequence_expand_op_lite basic SRCS sequence_expand_op.cc DEPS ${op_DEPS}) add_operator(squeeze_op_lite basic SRCS squeeze_op.cc DEPS ${op_DEPS}) -add_operator(unsqueeze_op_lite extra SRCS unsqueeze_op.cc DEPS ${op_DEPS}) -add_operator(im2sequence_op basic SRCS im2sequence_op.cc DEPS ${op_DEPS}) +add_operator(unsqueeze_op_lite basic SRCS unsqueeze_op.cc DEPS ${op_DEPS}) +add_operator(stack_op basic SRCS stack_op.cc DEPS ${op_DEPS}) +add_operator(cast_op_lite basic SRCS cast_op.cc DEPS ${op_DEPS}) +add_operator(affine_channel_op basic SRCS affine_channel_op.cc DEPS ${op_DEPS}) +add_operator(range_op basic SRCS range_op.cc DEPS ${op_DEPS}) +add_operator(reduce_mean_op basic SRCS reduce_mean_op.cc DEPS ${op_DEPS}) +add_operator(relu_op basic SRCS relu_op.cc DEPS ${op_DEPS}) +add_operator(io_copy_op basic SRCS io_copy_op.cc DEPS ${op_DEPS}) +add_operator(fusion_elementwise_activation_ops basic SRCS fusion_elementwise_activation_ops.cc DEPS elementwise_ops ${op_DEPS}) +add_operator(io_copy_once_op basic SRCS io_copy_once_op.cc DEPS io_copy_op ${op_DEPS}) +add_operator(dropout_op basic SRCS dropout_op.cc DEPS ${op_DEPS}) +add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS}) +add_operator(graph_op basic SRCS graph_op.cc DEPS ${op_DEPS}) + +# 2.basic ops not used in basic models +add_operator(negative_op extra SRCS negative_op.cc DEPS ${op_DEPS}) +add_operator(crop_op extra SRCS crop_op.cc DEPS ${op_DEPS}) +add_operator(assign_op extra SRCS assign_op.cc DEPS ${op_DEPS}) +add_operator(power_op extra SRCS power_op.cc DEPS ${op_DEPS}) +add_operator(norm_op extra SRCS norm_op.cc DEPS ${op_DEPS}) + +# 3.extra ops +add_operator(search_group_padding extra SRCS search_group_padding_op.cc DEPS ${op_DEPS}) +add_operator(lrn_op_lite extra SRCS lrn_op.cc DEPS ${op_DEPS}) +add_operator(decode_bboxes_op_lite extra SRCS decode_bboxes_op.cc DEPS ${op_DEPS}) +add_operator(uniform_random_op extra SRCS uniform_random_op.cc DEPS ${op_DEPS}) +add_operator(axpy_op extra SRCS axpy_op.cc DEPS ${op_DEPS}) +add_operator(gru_unit_op extra SRCS gru_unit_op.cc DEPS ${op_DEPS}) +add_operator(gru_op extra SRCS gru_op.cc DEPS ${op_DEPS}) +add_operator(layout_once_op extra SRCS layout_once_op.cc DEPS ${op_DEPS}) +add_operator(density_prior_box_op extra SRCS density_prior_box_op.cc DEPS ${op_DEPS}) +add_operator(calib_once_op extra SRCS 
calib_once_op.cc DEPS ${op_DEPS}) +add_operator(reduce_max_op_lite extra SRCS reduce_max_op.cc DEPS ${op_DEPS}) +add_operator(shape_op_lite extra SRCS shape_op.cc DEPS ${op_DEPS}) +add_operator(sequence_expand_op_lite extra SRCS sequence_expand_op.cc DEPS ${op_DEPS}) +add_operator(im2sequence_op extra SRCS im2sequence_op.cc DEPS ${op_DEPS}) add_operator(gather_op extra SRCS gather_op.cc DEPS ${op_DEPS}) -add_operator(reduce_mean_op extra SRCS reduce_mean_op.cc DEPS ${op_DEPS}) -add_operator(stack_op extra SRCS stack_op.cc DEPS ${op_DEPS}) -add_operator(cast_op_lite extra SRCS cast_op.cc DEPS ${op_DEPS}) -add_operator(affine_channel_op extra SRCS affine_channel_op.cc DEPS ${op_DEPS}) add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEPS}) add_operator(generate_proposals_op extra SRCS generate_proposals_op.cc DEPS ${op_DEPS}) add_operator(roi_align_op extra SRCS roi_align_op.cc DEPS ${op_DEPS}) @@ -73,16 +79,26 @@ add_operator(box_clip_op extra SRCS box_clip_op.cc DEPS ${op_DEPS}) add_operator(flatten_op extra SRCS flatten_op.cc DEPS ${op_DEPS}) add_operator(fake_quantize_range_abs_max_op extra SRCS fake_quantize_range_abs_max.cc DEPS ${op_DEPS}) add_operator(sequence_expand_as_op_lite extra SRCS sequence_expand_as_op.cc DEPS ${op_DEPS}) -add_operator(range_op extra SRCS range_op.cc DEPS ${op_DEPS}) add_operator(assign_value_op extra SRCS assign_value_op.cc DEPS ${op_DEPS}) + add_operator(fake_quantize_dequantize_moving_avg_abs_max_op extra SRCS fake_quantize_dequantize_moving_avg_max_abs.cc DEPS ${op_DEPS}) add_operator(fake_channel_wise_dequantize_max_abs_op extra SRCS fake_channel_wise_dequantize_max_abs.cc DEPS ${op_DEPS}) add_operator(sequence_reshape_op_lite extra SRCS sequence_reshape_op.cc DEPS ${op_DEPS}) +add_operator(sequence_reverse_op_lite extra SRCS sequence_reverse_op.cc DEPS ${op_DEPS}) add_operator(reduce_sum_op_lite extra SRCS reduce_ops.cc DEPS ${op_DEPS}) +add_operator(match_matrix_tensor_op_lite extra SRCS match_matrix_tensor_op.cc DEPS ${op_DEPS}) +add_operator(search_seq_depadding_op_lite extra SRCS search_seq_depadding_op.cc DEPS ${op_DEPS}) +add_operator(search_grnn_op_lite extra SRCS search_grnn_op.cc DEPS ${op_DEPS}) +add_operator(search_seq_softmax_op_lite extra SRCS search_seq_softmax_op.cc DEPS ${op_DEPS}) +add_operator(sequence_concat_op_lite extra SRCS sequence_concat_op.cc DEPS ${op_DEPS}) +add_operator(var_conv_2d_op_lite extra SRCS var_conv_2d_op.cc DEPS ${op_DEPS}) +add_operator(attention_padding_mask_op_lite extra SRCS attention_padding_mask_op.cc DEPS ${op_DEPS}) +add_operator(sequence_arithmetic_op_lite extra SRCS sequence_arithmetic_op.cc DEPS ${op_DEPS}) # for OCR specific add_operator(while_op extra SRCS while_op.cc DEPS ${op_DEPS}) add_operator(lookup_table_op extra SRCS lookup_table_op.cc DEPS ${op_DEPS}) +add_operator(lookup_table_v2_op extra SRCS lookup_table_v2_op.cc DEPS ${op_DEPS}) add_operator(beam_search_decode_op extra SRCS beam_search_decode_op.cc DEPS ${op_DEPS}) add_operator(graph_op_lite extra SRCS graph_op.cc DEPS ${op_DEPS}) add_operator(logical_xor extra SRCS logical_op.cc DEPS ${op_DEPS}) @@ -106,7 +122,11 @@ add_operator(topk_op extra SRCS topk_op.cc DEPS ${op_DEPS}) add_operator(increment_op extra SRCS increment_op.cc DEPS ${op_DEPS}) add_operator(layer_norm_op extra SRCS layer_norm_op.cc DEPS ${op_DEPS}) add_operator(sequence_softmax_op extra SRCS sequence_softmax_op.cc DEPS ${op_DEPS}) - +# for content-dnn specific +add_operator(search_aligned_mat_mul_op extra SRCS search_aligned_mat_mul_op.cc DEPS 
${op_DEPS}) +add_operator(search_seq_fc_op extra SRCS search_seq_fc_op.cc DEPS ${op_DEPS}) +add_operator(sequence_topk_avg_pooling_op basic SRCS sequence_topk_avg_pooling_op.cc DEPS ${op_DEPS}) +add_operator(search_fc_op basic SRCS search_fc_op.cc DEPS ${op_DEPS}) if (NOT LITE_WITH_X86) lite_cc_test(test_fc_op SRCS fc_op_test.cc @@ -122,8 +142,8 @@ if (NOT LITE_WITH_X86) lite_cc_test(test_batch_norm_op SRCS batch_norm_op_test.cc DEPS batch_norm_op memory) lite_cc_test(test_concat_op SRCS concat_op_test.cc DEPS concat_op memory scope) lite_cc_test(test_calib_op SRCS calib_op_test.cc DEPS calib_op memory ARM_DEPS calib_compute_arm) - lite_cc_test(test_fusion_elementwise_activation_ops - SRCS fusion_elementwise_activation_ops_test.cc - DEPS fusion_elementwise_activation_ops memory) lite_cc_test(test_transpose_op SRCS transpose_op_test.cc DEPS transpose_op memory) + lite_cc_test(test_fusion_elementwise_activation_ops + SRCS fusion_elementwise_activation_ops_test.cc + DEPS fusion_elementwise_activation_ops memory) endif() diff --git a/lite/operators/activation_ops.cc b/lite/operators/activation_ops.cc index c3c5de311f..6ddcee0cb9 100644 --- a/lite/operators/activation_ops.cc +++ b/lite/operators/activation_ops.cc @@ -117,6 +117,7 @@ REGISTER_LITE_OP(log, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(exp, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(floor, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(hard_sigmoid, paddle::lite::operators::ActivationOp); +REGISTER_LITE_OP(sqrt, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(rsqrt, paddle::lite::operators::ActivationOp); REGISTER_LITE_OP(softsign, paddle::lite::operators::ActivationOp); diff --git a/lite/operators/attention_padding_mask_op.cc b/lite/operators/attention_padding_mask_op.cc new file mode 100644 index 0000000000..a88df0e7a9 --- /dev/null +++ b/lite/operators/attention_padding_mask_op.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/attention_padding_mask_op.h" +#include "lite/core/op_registry.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool AttentionPaddingMaskOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + CHECK_OR_FALSE(param_.Out); + CHECK_OR_FALSE(param_.pad_begin); + return true; +} + +bool AttentionPaddingMaskOp::InferShape() const { + auto src_len = param_.X->lod()[0][1]; + CHECK_EQ(src_len, param_.X->dims()[1]) + << "Mismatch source length, expect: " << src_len + << ", get: " << param_.X->lod()[0][1]; + auto att_batch = param_.X->lod()[0].size() - 1; + auto src_batch = param_.Y->lod()[0].size() - 1; + CHECK_EQ(att_batch % src_batch, 0) + << "Mismatch batch size, bottom0: " << att_batch + << ", bottom1: " << src_batch; + + param_.pad_begin->Resize({static_cast(src_batch)}); + param_.Out->Resize(param_.X->dims()); + param_.Out->set_lod(param_.X->lod()); + + return true; +} + +bool AttentionPaddingMaskOp::AttachImpl(const cpp::OpDesc &op_desc, + lite::Scope *scope) { + param_.X = scope->FindTensor(op_desc.Input("X").front()); + param_.Y = scope->FindTensor(op_desc.Input("Y").front()); + param_.Out = scope->FindMutableTensor(op_desc.Output("Out").front()); + param_.pad_begin = + scope->FindMutableTensor(op_desc.Output("pad_begin").front()); + + param_.pad_id = op_desc.GetAttr("pad_id"); + param_.mask = op_desc.GetAttr("mask"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(attention_padding_mask, + paddle::lite::operators::AttentionPaddingMaskOp); +REGISTER_LITE_OP(search_attention_padding_mask, + paddle::lite::operators::AttentionPaddingMaskOp); diff --git a/lite/operators/attention_padding_mask_op.h b/lite/operators/attention_padding_mask_op.h new file mode 100644 index 0000000000..894d68f622 --- /dev/null +++ b/lite/operators/attention_padding_mask_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/core/op_lite.h" + +namespace paddle { +namespace lite { +namespace operators { + +class AttentionPaddingMaskOp : public OpLite { + public: + AttentionPaddingMaskOp() {} + + explicit AttentionPaddingMaskOp(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "attention_padding_mask"; } + + private: + mutable AttentionPaddingMaskParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/conv_op.cc b/lite/operators/conv_op.cc index ceca1a61ce..6dab55ff3b 100644 --- a/lite/operators/conv_op.cc +++ b/lite/operators/conv_op.cc @@ -39,56 +39,38 @@ bool ConvOpLite::CheckShape() const { return true; } -inline int ConvOutputSize( - int input_size, int filter_size, int dilation, int padding, int stride) { +inline int ConvOutputSize(int input_size, + int filter_size, + int dilation, + int pad_left, + int pad_right, + int stride) { const int dkernel = dilation * (filter_size - 1) + 1; - int output_size = (input_size + 2 * padding - dkernel) / stride + 1; - // CHECK_GT_OR_FALSE(output_size, 0); + int output_size = + (input_size + (pad_left + pad_right) - dkernel) / stride + 1; return output_size; } -inline void UpdatePaddingAndDilation(std::vector* paddings, - std::vector* dilations, - const std::vector& strides, - const std::string padding_algorithm, - const lite::DDim data_dims, - const lite::DDim& ksize) { - // when padding_desc is "VALID" or "SAME" - if (padding_algorithm == "SAME") { - for (size_t i = 0; i < strides.size(); ++i) { - int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; - int pad_sum = - std::max((out_size - 1) * strides[i] + ksize[i] - data_dims[i + 2], - (int64_t)0); - // pad - *(paddings->begin() + i) = pad_sum / 2; - // dilation - *(dilations->begin() + i) = 1; - } - } else if (padding_algorithm == "VALID") { - for (auto& it : *paddings) { - it = 0; - } - } -} - bool ConvOpLite::InferShape() const { const auto in_dims = param_.x->dims(); const auto filter_dims = param_.filter->dims(); - UpdatePaddingAndDilation(¶m_.paddings, - ¶m_.dilations, + UpdatePaddingAndDilation(param_.paddings.get(), + param_.dilations.get(), param_.strides, padding_algorithm_, in_dims, filter_dims); std::vector output_shape({in_dims[0], filter_dims[0]}); + auto paddings = *param_.paddings; + auto dilations = *param_.dilations; for (size_t i = 0; i < param_.strides.size(); ++i) { output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], - param_.dilations[i], - param_.paddings[i], + dilations[i], + paddings[i * 2], + paddings[i * 2 + 1], param_.strides[i])); } diff --git a/lite/operators/conv_op.h b/lite/operators/conv_op.h index e764819f63..3ab34bc1d0 100644 --- a/lite/operators/conv_op.h +++ b/lite/operators/conv_op.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once +#include #include #include #include "lite/core/kernel.h" @@ -47,9 +48,10 @@ class ConvOpLite : public OpLite { param_.output = scope->FindVar(Out)->GetMutable(); param_.strides = op_desc.GetAttr>("strides"); - param_.paddings = op_desc.GetAttr>("paddings"); + auto paddings = op_desc.GetAttr>("paddings"); param_.groups = op_desc.GetAttr("groups"); - param_.dilations = op_desc.GetAttr>("dilations"); + auto dilations = op_desc.GetAttr>("dilations"); + param_.dilations = std::make_shared>(dilations); // optional params std::vector input_arg_names = op_desc.InputArgumentNames(); @@ -109,12 +111,24 @@ class ConvOpLite : public OpLite { param_.output_scale = op_desc.GetAttr("output_scale"); } } + + // 2-pad to 4-pad + if (paddings.size() == 2L) { + for (size_t i = 0; i < param_.strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } else { + if (paddings.size() != 4L) { + LOG(FATAL) + << "Paddings size should be the same or twice as the input size."; + } + } + param_.paddings = std::make_shared>(paddings); return true; } - void AttachKernel(KernelBase* kernel) override { - kernel->SetParam(param_); - } + void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); } std::string DebugString() const override { return "conv2d"; } @@ -123,6 +137,34 @@ class ConvOpLite : public OpLite { std::string padding_algorithm_{""}; }; +inline void UpdatePaddingAndDilation(std::vector* paddings, + std::vector* dilations, + const std::vector& strides, + const std::string padding_algorithm, + const lite::DDim data_dims, + const lite::DDim& ksize) { + // when padding_desc is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (size_t i = 0; i < strides.size(); ++i) { + int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; + int pad_sum = std::max( + (out_size - 1) * strides[i] + ksize[i + 2] - data_dims[i + 2], + (int64_t)0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + // pad + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + // dilation + *(dilations->begin() + i) = 1; + } + } else if (padding_algorithm == "VALID") { + for (auto& it : *paddings) { + it = 0; + } + } +} + } // namespace operators } // namespace lite } // namespace paddle diff --git a/lite/operators/conv_transpose_op.cc b/lite/operators/conv_transpose_op.cc index fb6b431fff..a472ae0745 100644 --- a/lite/operators/conv_transpose_op.cc +++ b/lite/operators/conv_transpose_op.cc @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
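The UpdatePaddingAndDilation helper now defined in conv_op.h implements the "SAME" rule with a possibly asymmetric split: pad_sum = max((out - 1) * stride + ksize - in, 0) with out = ceil(in / stride), then pad_0 = pad_sum / 2, pad_1 = pad_sum - pad_0, and the dilation is reset to 1. A quick worked example for one spatial dimension: with in = 7, ksize = 3, stride = 2 we get out = 4 and pad_sum = 2, so pad_0 = pad_1 = 1; with in = 8 the same kernel and stride give pad_sum = 1, i.e. pad_0 = 0 and pad_1 = 1, the extra row of padding going to the bottom/right side. Under "VALID" all paddings are simply zeroed.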
- #include "lite/operators/conv_transpose_op.h" +#include #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" @@ -32,24 +32,75 @@ bool ConvTransposeOpLite::CheckShape() const { CHECK_EQ_OR_FALSE(in_dims.size(), filter_dims.size()); CHECK_OR_FALSE(in_dims.size() - param_.strides.size() == 2U); - CHECK_EQ_OR_FALSE(param_.paddings.size(), param_.strides.size()); CHECK_OR_FALSE(in_dims[1] % param_.groups == 0); + CHECK_EQ_OR_FALSE(filter_dims.size(), 4UL); return true; } +inline int ConvTransposeOutputSize(int input_size, + int filter_size, + int dilation, + int pad_left, + int pad_right, + int stride) { + const int dkernel = dilation * (filter_size - 1) + 1; + int output_size = (input_size - 1) * stride - pad_left - pad_right + dkernel; + + return output_size; +} + +inline void UpdatePaddingAndDilation(std::vector* paddings, + std::vector* dilations, + const std::vector& strides, + const std::string padding_algorithm, + const lite::DDim data_dims, + const lite::DDim& ksize) { + // when padding_desc is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (size_t i = 0; i < strides.size(); ++i) { + int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; + int pad_sum = std::max( + (out_size - 1) * strides[i] + ksize[i + 2] - data_dims[i + 2], + (int64_t)0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + // pad + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + // dilation + *(dilations->begin() + i) = 1; + } + } else if (padding_algorithm == "VALID") { + for (auto& it : *paddings) { + it = 0; + } + } +} + bool ConvTransposeOpLite::InferShape() const { const auto in_dims = param_.x->dims(); const auto filter_dims = param_.filter->dims(); + UpdatePaddingAndDilation(param_.paddings.get(), + param_.dilations.get(), + param_.strides, + padding_algorithm_, + in_dims, + filter_dims); + auto paddings = *param_.paddings; + auto dilations = *param_.dilations; + std::vector output_shape; output_shape.push_back(in_dims[0]); output_shape.push_back(filter_dims[1] * param_.groups); - for (int i = 0; i < param_.strides.size(); i++) { - int kernel_extent = param_.dilations[i] * (filter_dims[i + 2] - 1) + 1; - int output_len = (in_dims[i + 2] - 1) * param_.strides[i] + kernel_extent - - 2 * param_.paddings[i]; - output_shape.push_back(output_len); + for (size_t i = 0; i < param_.strides.size(); ++i) { + output_shape.push_back(ConvTransposeOutputSize(in_dims[i + 2], + filter_dims[i + 2], + dilations[i], + paddings[i * 2], + paddings[i * 2 + 1], + param_.strides[i])); } // Set output dims @@ -58,8 +109,8 @@ bool ConvTransposeOpLite::InferShape() const { } // TODO(Superjomn) replace framework::OpDesc with a lite one. 
-bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc &op_desc, - lite::Scope *scope) { +bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { auto X = op_desc.Input("Input").front(); auto Filter = op_desc.Input("Filter").front(); auto Out = op_desc.Output("Output").front(); @@ -68,9 +119,27 @@ bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc &op_desc, param_.output = scope->FindVar(Out)->GetMutable(); param_.strides = op_desc.GetAttr>("strides"); - param_.paddings = op_desc.GetAttr>("paddings"); + auto paddings = op_desc.GetAttr>("paddings"); param_.groups = op_desc.GetAttr("groups"); - param_.dilations = op_desc.GetAttr>("dilations"); + auto dilations = op_desc.GetAttr>("dilations"); + + if (op_desc.HasAttr("padding_algorithm")) { + padding_algorithm_ = op_desc.GetAttr("padding_algorithm"); + } + // 2-pad to 4-pad + if (paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } else { + if (paddings.size() != 4L) { + LOG(FATAL) + << "Paddings size should be the same or twice as the input size."; + } + } + param_.paddings = std::make_shared>(paddings); + param_.dilations = std::make_shared>(dilations); // optional params std::vector input_arg_names = op_desc.InputArgumentNames(); @@ -81,7 +150,7 @@ bool ConvTransposeOpLite::AttachImpl(const cpp::OpDesc &op_desc, auto bias_var = scope->FindVar(bias_arguments.front()); if (bias_var != nullptr) { param_.bias = - const_cast(&(bias_var->Get())); + const_cast(&(bias_var->Get())); } } } diff --git a/lite/operators/conv_transpose_op.h b/lite/operators/conv_transpose_op.h index d8b64c78ef..fb25c022f9 100644 --- a/lite/operators/conv_transpose_op.h +++ b/lite/operators/conv_transpose_op.h @@ -44,6 +44,7 @@ class ConvTransposeOpLite : public OpLite { private: mutable ConvParam param_; + std::string padding_algorithm_{""}; }; } // namespace operators diff --git a/lite/operators/fill_constant_op.cc b/lite/operators/fill_constant_op.cc index 6e4bee4da8..acf9701cbd 100644 --- a/lite/operators/fill_constant_op.cc +++ b/lite/operators/fill_constant_op.cc @@ -29,6 +29,12 @@ class FillConstantOp : public OpLite { } bool InferShape() const override { + lite::Tensor* shape_tensor_ = param_.shape_tensor; + if (param_.shape.empty() && shape_tensor_ != nullptr) { + param_.Out->Resize(shape_tensor_->dims()); + return true; + } + param_.Out->Resize(param_.shape); return true; } @@ -41,6 +47,23 @@ class FillConstantOp : public OpLite { param_.shape = opdesc.GetAttr>("shape"); param_.value = opdesc.GetAttr("value"); param_.force_cpu = opdesc.GetAttr("force_cpu"); + param_.shape_tensor = nullptr; + param_.shape_tensor_list = {}; + + std::vector input_arg_names = opdesc.InputArgumentNames(); + if (std::find(input_arg_names.begin(), + input_arg_names.end(), + "ShapeTensor") != input_arg_names.end()) { + auto args = opdesc.Input("ShapeTensor"); + auto* var = scope->FindVar(args.front()); + param_.shape_tensor = var->GetMutable(); + } + if (opdesc.HasAttr("ShapeTensorList")) { + auto args = opdesc.Input("ShapeTensorList"); + auto* var = scope->FindVar(args.front()); + param_.shape_tensor_list = + *(var->GetMutable>()); + } return true; } diff --git a/lite/operators/interpolate_op.cc b/lite/operators/interpolate_op.cc index b98240ba4f..936da73d89 100644 --- a/lite/operators/interpolate_op.cc +++ b/lite/operators/interpolate_op.cc @@ -45,23 +45,42 @@ bool InterpolateOp::InferShape() const { int out_h; int out_w; - if 
(OutSize != nullptr) { - auto outsize_data = OutSize->data(); - int h_out = outsize_data[0]; // HW - int w_out = outsize_data[1]; // HW - param_.Out->Resize({n, c, h_out, w_out}); + auto SizeTensor = param_.SizeTensor; + if (!SizeTensor.empty()) { + CHECK(SizeTensor.size() == 2) + << "Input(SizeTensor)'size of Op(interpolate) must be 2. " + "Attr(out_shape)'s length must be 2 for 4-D input tensor."; + out_h = param_.out_h; + out_w = param_.out_w; + param_.Out->Resize({n, c, out_h, out_w}); + return true; + } + + auto Scale = param_.Scale; + if (Scale) { + auto scale_dims = Scale->dims(); + CHECK(scale_dims.size() == 1) << "Scale's dimension size must be 1."; + out_h = -1; + out_w = -1; } else { - if (0 >= param_.out_h && 0 >= param_.out_w) { - out_h = h * param_.scale; - out_w = w * param_.scale; + auto scale = param_.scale; + if (scale > 0) { + out_h = static_cast(h * scale); + out_w = static_cast(w * scale); out_h = out_h > 0 ? out_h : -1; out_w = out_w > 0 ? out_w : -1; } else { out_h = param_.out_h; out_w = param_.out_w; } - param_.Out->Resize({n, c, out_h, out_w}); } + + if (OutSize != nullptr) { + auto out_lod = param_.Out->mutable_lod(); + *out_lod = param_.X->lod(); + } + param_.Out->Resize({n, c, out_h, out_w}); + return true; } @@ -76,6 +95,24 @@ bool InterpolateOp::AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) { } else { param_.OutSize = nullptr; } + + if (op_desc.HasInput("SizeTensor")) { + auto size_tensor = op_desc.Input("SizeTensor"); + for (auto var : size_tensor) { + param_.SizeTensor.push_back( + scope->FindVar(var)->GetMutable()); + } + } + + if (op_desc.HasInput("Scale")) { + auto scale_var_names = op_desc.Input("Scale"); + if (scale_var_names.size() > 0) { + param_.Scale = + scope->FindVar(scale_var_names.front())->GetMutable(); + } + } else { + param_.Scale = nullptr; + } auto Out = op_desc.Output("Out").front(); param_.X = scope->FindVar(X)->GetMutable(); param_.Out = scope->FindVar(Out)->GetMutable(); diff --git a/lite/operators/lookup_table_v2_op.cc b/lite/operators/lookup_table_v2_op.cc new file mode 100644 index 0000000000..c783695163 --- /dev/null +++ b/lite/operators/lookup_table_v2_op.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
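The rewritten InterpolateOp::InferShape above resolves the output extent in priority order: an explicit SizeTensor pair, a runtime Scale tensor (which leaves the static shape at -1), the scalar scale attribute, and finally the out_h/out_w attributes. A minimal sketch of the scalar-scale branch, assuming a 4-D NCHW input (the helper name is illustrative):

// Derive the interpolate output H/W from a scalar scale attribute,
// falling back to the explicit out_h/out_w attributes when scale <= 0.
inline void interp_out_hw(int h, int w, float scale,
                          int attr_out_h, int attr_out_w,
                          int* out_h, int* out_w) {
  if (scale > 0.f) {
    *out_h = static_cast<int>(h * scale);
    *out_w = static_cast<int>(w * scale);
    if (*out_h <= 0) *out_h = -1;
    if (*out_w <= 0) *out_w = -1;
  } else {
    *out_h = attr_out_h;
    *out_w = attr_out_w;
  }
}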
+ +#include "lite/operators/lookup_table_v2_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool LookupTableV2OpLite::CheckShape() const { + CHECK_OR_FALSE(param_.W) + CHECK_OR_FALSE(param_.Ids) + CHECK_OR_FALSE(param_.Out) + + auto table_dims = param_.W->dims(); + + CHECK_EQ_OR_FALSE(table_dims.size(), 2) + + return true; +} + +bool LookupTableV2OpLite::InferShape() const { + auto table_dims = param_.W->dims(); + auto ids_dims = param_.Ids->dims(); + + std::vector out_dims; + for (int i = 0; i < ids_dims.size(); ++i) { + out_dims.push_back(ids_dims[i]); + } + out_dims.push_back(table_dims[1]); + param_.Out->Resize(lite::DDim{out_dims}); + param_.Out->set_lod(param_.Ids->lod()); + return true; +} + +bool LookupTableV2OpLite::AttachImpl(const cpp::OpDesc &op_desc, + lite::Scope *scope) { + auto input = op_desc.Input("W").front(); + auto ids = op_desc.Input("Ids").front(); + auto out = op_desc.Output("Out").front(); + + param_.W = scope->FindVar(input)->GetMutable(); + param_.Ids = scope->FindVar(ids)->GetMutable(); + param_.Out = scope->FindVar(out)->GetMutable(); + + param_.padding_idx = op_desc.GetAttr("padding_idx"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(lookup_table_v2, paddle::lite::operators::LookupTableV2OpLite) diff --git a/lite/operators/lookup_table_v2_op.h b/lite/operators/lookup_table_v2_op.h new file mode 100644 index 0000000000..dabff3f0ca --- /dev/null +++ b/lite/operators/lookup_table_v2_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class LookupTableV2OpLite : public OpLite { + public: + LookupTableV2OpLite() {} + explicit LookupTableV2OpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "LookupTable"; } + + private: + mutable LookupTableParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/lrn_op.cc b/lite/operators/lrn_op.cc index 34b00653f9..aff3e5af55 100644 --- a/lite/operators/lrn_op.cc +++ b/lite/operators/lrn_op.cc @@ -37,11 +37,13 @@ bool LrnOpLite::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) { auto Out_name = opdesc.Output("Out").front(); param_.X = GetVar(scope, X_name); param_.Out = GetMutableVar(scope, Out_name); - param_.local_size = opdesc.GetAttr("local_size"); + param_.n = opdesc.GetAttr("n"); param_.alpha = opdesc.GetAttr("alpha"); param_.beta = opdesc.GetAttr("beta"); param_.k = opdesc.GetAttr("k"); - param_.norm_region = opdesc.GetAttr("norm_region"); + if (opdesc.HasAttr("norm_region")) { + param_.norm_region = opdesc.GetAttr("norm_region"); + } return true; } diff --git a/lite/operators/match_matrix_tensor_op.cc b/lite/operators/match_matrix_tensor_op.cc new file mode 100644 index 0000000000..a8095a94bf --- /dev/null +++ b/lite/operators/match_matrix_tensor_op.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
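
Note on the lrn_op change above: the window size is now read from the "n" attribute instead of "local_size", and norm_region is read only when present so the LrnParam default survives otherwise. A trivial sketch of the guarded read, using a hypothetical attribute map in place of cpp::OpDesc:

#include <map>
#include <string>

// Hypothetical stand-in for the op descriptor's attribute lookup.
std::string GetNormRegion(const std::map<std::string, std::string>& attrs) {
  std::string norm_region = "AcrossChannels";  // default kept from LrnParam
  auto it = attrs.find("norm_region");
  if (it != attrs.end()) {     // analogous to opdesc.HasAttr("norm_region")
    norm_region = it->second;  // analogous to opdesc.GetAttr(...)
  }
  return norm_region;
}
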
+ +#include "lite/operators/match_matrix_tensor_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool MatchMatrixTensorOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.y); + CHECK_OR_FALSE(param_.w); + CHECK_OR_FALSE(param_.out); + CHECK_OR_FALSE(param_.tmp); + + DDim x_dims = param_.x->dims(); + DDim y_dims = param_.y->dims(); + DDim w_dims = param_.w->dims(); + int dim_t = param_.dim_t; + + CHECK_OR_FALSE(x_dims.size() == 2); + CHECK_OR_FALSE(y_dims.size() == 2); + CHECK_OR_FALSE(w_dims.size() == 3); + + CHECK_OR_FALSE(x_dims[1] == w_dims[0] && y_dims[1] == w_dims[2] && + w_dims[1] == dim_t); + + return true; +} + +bool MatchMatrixTensorOpLite::InferShape() const { + const Tensor* x = param_.x; + const Tensor* y = param_.y; + DDim x_dims = param_.x->dims(); + DDim y_dims = param_.y->dims(); + DDim w_dims = param_.w->dims(); + int dim_t = param_.dim_t; + + const auto& x_lod = x->lod(); + CHECK_OR_FALSE(!x_lod.empty()); + const auto& x_lod_0 = x_lod[0]; + CHECK_OR_FALSE(x_lod_0.size() >= 2); + CHECK_OR_FALSE(x_dims[0] == x_lod_0.back()); + + const auto& y_lod = y->lod(); + CHECK_OR_FALSE(!y_lod.empty()); + const auto& y_lod_0 = y_lod[0]; + CHECK_OR_FALSE(y_lod_0.size() >= 2); + CHECK_OR_FALSE(y_dims[0] == y_lod_0.back()); + + CHECK_OR_FALSE(x_lod_0.size() == y_lod_0.size()); + + int out_dim_0 = 0; + for (size_t i = 1; i < x_lod_0.size(); i++) { + int x_len = x_lod_0[i] - x_lod_0[i - 1]; + int y_len = y_lod_0[i] - y_lod_0[i - 1]; + out_dim_0 += (x_len * y_len); + } + out_dim_0 *= dim_t; + int tmp_dim_0 = x_dims[0] * dim_t * x_dims[1]; + + param_.out->Resize({out_dim_0, 1}); + param_.tmp->Resize({tmp_dim_0, 1}); + return true; +} + +bool MatchMatrixTensorOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + auto x = op_desc.Input("X").front(); + auto w = op_desc.Input("W").front(); + auto y = op_desc.Input("Y").front(); + auto out = op_desc.Output("Out").front(); + auto tmp = op_desc.Output("Tmp").front(); + + param_.x = scope->FindVar(x)->GetMutable(); + param_.w = scope->FindVar(w)->GetMutable(); + param_.y = scope->FindVar(y)->GetMutable(); + param_.out = scope->FindVar(out)->GetMutable(); + param_.tmp = scope->FindVar(tmp)->GetMutable(); + + param_.dim_t = op_desc.GetAttr("dim_t"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(match_matrix_tensor, + paddle::lite::operators::MatchMatrixTensorOpLite); diff --git a/lite/operators/match_matrix_tensor_op.h b/lite/operators/match_matrix_tensor_op.h new file mode 100644 index 0000000000..404183ea5b --- /dev/null +++ b/lite/operators/match_matrix_tensor_op.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class MatchMatrixTensorOpLite : public OpLite { + public: + MatchMatrixTensorOpLite() {} + + explicit MatchMatrixTensorOpLite(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "match_matrix_tensor"; } + + private: + mutable MatchMatrixTensorParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/op_params.h b/lite/operators/op_params.h index 8609f17888..4f0c707484 100644 --- a/lite/operators/op_params.h +++ b/lite/operators/op_params.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include #include #include #include @@ -89,11 +90,21 @@ struct FcParam { WITH_INT8_CONFIG }; +struct SearchSeqFcParam { + lite::Tensor* x{nullptr}; + lite::Tensor* w{nullptr}; + lite::Tensor* b{nullptr}; + lite::Tensor* out{nullptr}; + int out_size; +}; + // For Interpolate Op struct InterpolateParam { lite::Tensor* X{}; lite::Tensor* OutSize{}; lite::Tensor* Out{}; + std::vector SizeTensor; + lite::Tensor* Scale{}; float scale{0.f}; int out_h{-1}; @@ -101,6 +112,7 @@ struct InterpolateParam { bool align_corners{true}; int align_mode{1}; std::string interp_method{"Nearest"}; + DataLayoutType data_layout{DATALAYOUT(kNCHW)}; }; // For Mul Op @@ -242,9 +254,19 @@ struct ConvParam { lite::Tensor* residualData{nullptr}; lite::Tensor* output{}; std::vector strides{1, 1}; - std::vector paddings{0, 0}; + /* paddings type change + * from std::vector to std::shared_ptr> + * to support dynamically modify padding + * let kernel param and operator param Synchronous update + */ + std::shared_ptr> paddings; int groups{1}; - std::vector dilations{1, 1}; + /* dilations type change + * from std::vector to std::shared_ptr> + * to support dynamically modify padding + * let kernel param and operator param Synchronous update + */ + std::shared_ptr> dilations; bool fuse_relu_before_depthwise_conv{false}; bool use_mkldnn{false}; bool fuse_relu{false}; // only used in mkldnn kernel @@ -291,7 +313,12 @@ struct PoolParam { bool global_pooling{ false}; // if true, knernel size and paddings will be ignored std::vector strides{1, 1}; - std::vector paddings{0, 0}; + /* paddings type change + * from std::vector to std::shared_ptr> + * to support dynamically modify padding + * let kernel param and operator param Synchronous update + */ + std::shared_ptr> paddings; bool exclusive{true}; bool adaptive{false}; bool ceil_mode{false}; @@ -317,6 +344,9 @@ struct DropoutParam { struct SplitParam { lite::Tensor* x{}; std::vector output{}; + lite::Tensor* axis_tensor; + std::vector sections_tensor_list{}; + int axis{-1}; int num{0}; std::vector sections; @@ -378,6 +408,9 @@ struct MeanGradParam { struct FillConstantParam { int dtype{static_cast(VarDescAPI::VarDataType::FP32)}; std::vector shape{}; + lite::Tensor* shape_tensor; + std::vector shape_tensor_list{}; + float value{0.0f}; // useless for x86, keep it for compatibility bool force_cpu{false}; @@ -511,8 +544,8 @@ struct GRUUnitParam { struct LrnParam { const lite::Tensor* X{}; lite::Tensor* Out{}; - int 
local_size{5}; - float alpha{1.}; + int n{5}; + float alpha{1e-4}; float beta{0.75}; float k{1.}; std::string norm_region{"AcrossChannels"}; @@ -729,6 +762,14 @@ struct SequencePoolParam { #endif }; +struct SearchGroupPaddingParam { + lite::Tensor* x{}; + lite::Tensor* out_emb_padding{}; + lite::Tensor* out_new{}; + lite::Tensor* out_padding{}; + int pad_id; +}; + struct SequenceReshapeParam { lite::Tensor* x{}; lite::Tensor* output{}; @@ -748,6 +789,32 @@ struct SequenceExpandAsParam { lite::Tensor* out{nullptr}; }; +struct SequenceReverseParam { + const lite::Tensor* X{}; + lite::Tensor* Out{}; +}; + +struct SequenceConcatParam { + std::vector X{}; + lite::Tensor* Out{}; +}; + +struct AttentionPaddingMaskParam { + const lite::Tensor* X{}; + const lite::Tensor* Y{}; + int pad_id; + float mask; + lite::Tensor* Out{}; + lite::Tensor* pad_begin{}; +}; + +struct SequenceArithmeticParam { + const lite::Tensor* X{}; + const lite::Tensor* Y{}; + int op_type{1}; + lite::Tensor* Out{}; +}; + struct ReduceMaxParam { const lite::Tensor* X{}; lite::Tensor* Out{}; @@ -776,6 +843,22 @@ struct ReduceParam { bool reduce_all{false}; }; +struct VarConv2DParam { + const lite::Tensor* X{}; + const lite::Tensor* ROW{}; + const lite::Tensor* COLUMN{}; + const lite::Tensor* W{}; + lite::Tensor* Out{}; + lite::Tensor* Col{}; + + int input_channel; + int output_channel; + int stride_h; + int stride_w; + int kernel_h; + int kernel_w; +}; + /// ----------------------- shape operators ---------------------- struct ShapeParam { const lite::Tensor* X{}; @@ -856,7 +939,7 @@ struct UnsqueezeParam { lite::Tensor* XShape{}; std::vector axes{}; const lite::Tensor* axes_tensor{}; - std::vector* axes_tensor_vct{}; + std::vector axes_tensor_vct{}; }; /// ----------------------- expand operators ---------------------- @@ -922,6 +1005,57 @@ struct AssignValueParam { lite::Tensor* Out{}; }; +/// --------------- sequence_topk_avg_pooling operators ------------------ +struct SequenceTopkAvgPoolingParam { + const lite::Tensor* X{}; + const lite::Tensor* ROW{}; + const lite::Tensor* COLUMN{}; + lite::Tensor* Out{}; + lite::Tensor* pos{}; + int channel_num{}; + std::vector topks{}; +}; + +/// --------------- search_fc operators ------------------ +struct SearchFcParam { + const lite::Tensor* X{}; + const lite::Tensor* W{}; + const lite::Tensor* b{}; + lite::Tensor* Out{}; + int out_size{}; +}; +/// --------------------- match_matrix_tensor operators -------------------- +struct MatchMatrixTensorParam { + const lite::Tensor* x{}; + const lite::Tensor* y{}; + const lite::Tensor* w{}; + lite::Tensor* out{}; + lite::Tensor* tmp{}; + + int dim_t; +}; + +/// --------------------- search_seq_depadding operators -------------------- +struct SearchSeqDepaddingParam { + const lite::Tensor* pad{}; + const lite::Tensor* src{}; + lite::Tensor* out{}; +}; + +/// --------------------- search_grnn operators -------------------- +struct SearchGrnnParam { + const lite::Tensor* x{}; + const lite::Tensor* wi{}; + const lite::Tensor* wh{}; + int num_input; + int num_hidden; + + lite::Tensor* out{}; + lite::Tensor* tmp_buffer{}; + lite::Tensor* idx_sorted_by_width{}; + lite::Tensor* layout_input{}; +}; + } // namespace operators } // namespace lite } // namespace paddle diff --git a/lite/operators/pool_op.cc b/lite/operators/pool_op.cc index 1ebbc059b7..c6f6eed28f 100644 --- a/lite/operators/pool_op.cc +++ b/lite/operators/pool_op.cc @@ -13,6 +13,7 @@ // limitations under the License. 
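
Note on the ConvParam / PoolParam comments above: paddings and dilations move from a plain vector to a shared pointer to a vector of ints, so the operator and the kernel hold the very same vector and a later rewrite of the four pad values (for example by UpdatePadding) is seen by the kernel without re-attaching the param. A minimal sketch of that sharing, with hypothetical OpSide / KernelSide structs standing in for the real params:

#include <cassert>
#include <memory>
#include <vector>

struct OpSide {    // hypothetical stand-in for the operator-side param
  std::shared_ptr<std::vector<int>> paddings;
};
struct KernelSide {  // hypothetical stand-in for the kernel-side param
  std::shared_ptr<std::vector<int>> paddings;
};

int main() {
  OpSide op;
  op.paddings = std::make_shared<std::vector<int>>(std::vector<int>{1, 1, 1, 1});
  KernelSide kernel;
  kernel.paddings = op.paddings;  // attaching copies only the shared_ptr

  // Later, shape inference rewrites the pads (e.g. for "SAME" padding):
  (*op.paddings)[0] = 0;
  (*op.paddings)[1] = 1;

  // The kernel observes the new values with no extra synchronisation step.
  assert((*kernel.paddings)[0] == 0 && (*kernel.paddings)[1] == 1);
  return 0;
}
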
#include "lite/operators/pool_op.h" +#include #include "lite/core/op_registry.h" namespace paddle { @@ -26,7 +27,7 @@ bool PoolOpLite::CheckShape() const { const auto& x_dims = param_.x->dims(); const auto& ksize = param_.ksize; const auto& strides = param_.strides; - const auto& paddings = param_.paddings; + const auto& paddings = *param_.paddings; // "Pooling intput should be 4-D or 5-D tensor." CHECK_OR_FALSE(x_dims.size() == 4 || x_dims.size() == 5); @@ -34,20 +35,27 @@ bool PoolOpLite::CheckShape() const { CHECK_OR_FALSE(x_dims.size() - ksize.size() == 2U); // Strides size and pooling size should be the same. CHECK_OR_FALSE(ksize.size() == strides.size()); - // Paddings size and pooling size should be the same. - CHECK_OR_FALSE(ksize.size() == paddings.size()); + // Paddings size must be 4. + CHECK_OR_FALSE(paddings.size() == 4L); return true; } -int PoolOutputSize( - int input_size, int filter_size, int padding, int stride, bool ceil_mode) { +int PoolOutputSize(int input_size, + int filter_size, + int pad_left, + int pad_right, + int stride, + bool ceil_mode) { int output_size; if (!ceil_mode) { - output_size = (input_size - filter_size + 2 * padding) / stride + 1; + output_size = + (input_size - filter_size + pad_left + pad_right) / stride + 1; } else { output_size = - (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; + (input_size - filter_size + pad_left + pad_right + stride - 1) / + stride + + 1; } return output_size; } @@ -55,14 +63,21 @@ int PoolOutputSize( bool PoolOpLite::InferShape() const { const auto x_dims = param_.x->dims(); std::vector& ksize = param_.ksize; + // dynamic update 4-pad + UpdatePadding(param_.paddings.get(), + param_.global_pooling, + param_.adaptive, + padding_algorithm_, + x_dims, + param_.strides, + ksize); if (param_.global_pooling) { ksize.resize(static_cast(x_dims.size()) - 2); for (size_t i = 0; i < ksize.size(); ++i) { - param_.paddings[i] = 0; ksize[i] = static_cast(x_dims[i + 2]); } } - + auto paddings = *param_.paddings; std::vector output_shape({x_dims[0], x_dims[1]}); if (param_.adaptive) { output_shape.insert( @@ -71,15 +86,14 @@ bool PoolOpLite::InferShape() const { for (size_t i = 0; i < param_.ksize.size(); ++i) { output_shape.push_back(PoolOutputSize(x_dims[i + 2], param_.ksize[i], - param_.paddings[i], + paddings[2 * i], + paddings[2 * i + 1], param_.strides[i], param_.ceil_mode)); } } param_.output->Resize(lite::DDim(output_shape)); - // ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); - // ctx->ShareLoD("X", "Out"); return true; } diff --git a/lite/operators/pool_op.h b/lite/operators/pool_op.h index aecec4c619..c44875ff95 100644 --- a/lite/operators/pool_op.h +++ b/lite/operators/pool_op.h @@ -14,6 +14,8 @@ #pragma once +#include +#include #include #include #include "lite/core/kernel.h" @@ -51,7 +53,7 @@ class PoolOpLite : public OpLite { param_.ksize = op_desc.GetAttr>("ksize"); param_.global_pooling = op_desc.GetAttr("global_pooling"); param_.strides = op_desc.GetAttr>("strides"); - param_.paddings = op_desc.GetAttr>("paddings"); + auto paddings = op_desc.GetAttr>("paddings"); if (op_desc.HasAttr("exclusive")) { param_.exclusive = op_desc.GetAttr("exclusive"); @@ -65,7 +67,23 @@ class PoolOpLite : public OpLite { if (op_desc.HasAttr("use_quantizer")) { param_.use_quantizer = op_desc.GetAttr("use_quantizer"); } - // param_.data_format = op_desc.GetAttr("data_format"); + if (op_desc.HasAttr("padding_algorithm")) { + padding_algorithm_ = op_desc.GetAttr("padding_algorithm"); + } + // 2-pad to 4-pad + if 
(paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } else { + if (paddings.size() != 4L) { + LOG(FATAL) + << "Paddings size should be the same or twice as the inputs size."; + } + } + param_.paddings = std::make_shared>(paddings); + return true; } @@ -75,8 +93,42 @@ class PoolOpLite : public OpLite { private: mutable PoolParam param_; + std::string padding_algorithm_{""}; }; +inline void UpdatePadding(std::vector *paddings, + const bool global_pooling, + const bool adaptive, + const std::string padding_algorithm, + const lite::DDim data_dims, + const std::vector &strides, + const std::vector &ksize) { + // when padding_algorithm is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (int i = 0; i < strides.size(); ++i) { + int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; + int pad_sum = + std::max((out_size - 1) * strides[i] + ksize[i] - data_dims[i + 2], + (int64_t)0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + } + } else if (padding_algorithm == "VALID") { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } + + // if global_pooling == true or adaptive == true, padding will be ignore + if (global_pooling || adaptive) { + for (auto it = paddings->begin(); it != paddings->end(); it++) { + *it = 0; + } + } +} + } // namespace operators } // namespace lite } // namespace paddle diff --git a/lite/operators/search_aligned_mat_mul_op.cc b/lite/operators/search_aligned_mat_mul_op.cc new file mode 100644 index 0000000000..43a276e3c7 --- /dev/null +++ b/lite/operators/search_aligned_mat_mul_op.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
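
Note on the pool_op changes above: AttachImpl expands a 2-value paddings attribute into the 4-value [top, bottom, left, right] form (so {1, 2} becomes {1, 1, 2, 2}), UpdatePadding then recomputes those four values for the SAME / VALID padding_algorithm, and PoolOutputSize now takes pad_left and pad_right separately. Worked SAME example with assumed numbers, input height 224, kernel 3, stride 2: out = ceil(224 / 2) = 112, pad_sum = max((112 - 1) * 2 + 3 - 224, 0) = 1, hence pad_top = 0, pad_bottom = 1, and PoolOutputSize gives (224 - 3 + 0 + 1) / 2 + 1 = 112. The same arithmetic as a standalone sketch for one spatial dimension:

#include <algorithm>
#include <cassert>

// Sketch mirroring UpdatePadding (SAME branch) and the two-sided output size.
void SamePad1D(int in, int ksize, int stride, int* pad_0, int* pad_1) {
  int out = (in + stride - 1) / stride;                        // ceil(in / stride)
  int pad_sum = std::max((out - 1) * stride + ksize - in, 0);  // total pad needed
  *pad_0 = pad_sum / 2;
  *pad_1 = pad_sum - *pad_0;
}

int PoolOut1D(int in, int ksize, int pad_0, int pad_1, int stride) {
  return (in - ksize + pad_0 + pad_1) / stride + 1;  // ceil_mode == false case
}

int main() {
  int p0 = 0, p1 = 0;
  SamePad1D(224, 3, 2, &p0, &p1);
  assert(p0 == 0 && p1 == 1);
  assert(PoolOut1D(224, 3, p0, p1, 2) == 112);
  return 0;
}
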
+ +#include "lite/operators/search_aligned_mat_mul_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchAlignedMatMulOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + CHECK_OR_FALSE(param_.Out); + + return true; +} + +bool SearchAlignedMatMulOpLite::InferShape() const { + const auto x_dims = param_.X->dims(); + const auto y_dims = param_.Y->dims(); + const auto& x_lod = param_.X->lod(); + const auto& y_lod = param_.Y->lod(); + bool x_transpose = param_.transpose_X; + bool y_transpose = param_.transpose_Y; + + CHECK_EQ(x_dims.size(), 2) << "X should be 2-D tensor"; + CHECK_EQ(y_dims.size(), 2) << "Y should be 2-D tensor"; + CHECK(!x_lod.empty()) << "The Input(X) must hold lod info."; + CHECK(!y_lod.empty()) << "The Input(Y) must hold lod info."; + + const auto& x_lod_0 = x_lod[0]; + const auto& y_lod_0 = y_lod[0]; + CHECK_GE(x_lod_0.size(), 2) << "The Input(X)'s lod info is corrupted."; + CHECK_GE(y_lod_0.size(), 2) << "The Input(Y)'s lod info is corrupted."; + CHECK_EQ(x_dims[0], static_cast(x_lod_0.back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + CHECK_EQ(y_dims[0], static_cast(y_lod_0.back())) + << "The Input(Y)'s lod info mismatches the actual tensor shape."; + CHECK_EQ(x_lod_0.size(), y_lod_0.size()) + << "The Length of X and Y must be equal."; + + int seq_num = x_lod_0.size() - 1; + int x_inner_size = x_dims[1]; + int y_inner_size = y_dims[1]; + int x_batch_size = x_lod_0[1]; + int y_batch_size = y_lod_0[1]; + int M = x_transpose ? x_inner_size : x_batch_size; + int N = y_transpose ? y_batch_size : y_inner_size; + int X_K = x_transpose ? x_batch_size : x_inner_size; + int Y_K = y_transpose ? y_inner_size : y_batch_size; + CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal"; + + LoD out_lod; + std::vector out_lod_0(seq_num + 1); + out_lod_0[0] = 0; + for (int i = 0; i < seq_num; i++) { + out_lod_0[i + 1] = out_lod_0[i] + M; + } + out_lod.push_back(out_lod_0); + DDim out_dims( + {static_cast(out_lod_0.back()), static_cast(N)}); + param_.Out->set_lod(out_lod); + param_.Out->Resize(out_dims); + return true; +} + +bool SearchAlignedMatMulOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + CHECK(!op_desc.Input("X").empty()); + CHECK(!op_desc.Input("Y").empty()); + CHECK(!op_desc.Output("Out").empty()); + auto X = op_desc.Input("X").front(); + auto Y = op_desc.Input("Y").front(); + auto Out = op_desc.Output("Out").front(); + param_.X = GetVar(scope, X); + param_.Y = GetVar(scope, Y); + param_.Out = GetMutableVar(scope, Out); + param_.transpose_X = op_desc.GetAttr("transpose_X"); + param_.transpose_Y = op_desc.GetAttr("transpose_Y"); + param_.alpha = op_desc.GetAttr("alpha"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_aligned_mat_mul, + paddle::lite::operators::SearchAlignedMatMulOpLite); diff --git a/lite/operators/search_aligned_mat_mul_op.h b/lite/operators/search_aligned_mat_mul_op.h new file mode 100644 index 0000000000..7321b7e9d1 --- /dev/null +++ b/lite/operators/search_aligned_mat_mul_op.h @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchAlignedMatMulOpLite : public OpLite { + public: + SearchAlignedMatMulOpLite() {} + + explicit SearchAlignedMatMulOpLite(const std::string &type) : OpLite(type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; + std::string DebugString() const override { return "search_aligned_mat_mul"; } + + private: + mutable MatMulParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_fc_op.cc b/lite/operators/search_fc_op.cc new file mode 100644 index 0000000000..2e77e36162 --- /dev/null +++ b/lite/operators/search_fc_op.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/search_fc_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchFcOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.W); + CHECK_OR_FALSE(param_.b); + CHECK_OR_FALSE(param_.Out); + + auto x_dims = param_.X->dims(); + CHECK_EQ(x_dims.size(), 2) << "The rank of X(Input) should be 2."; + auto w_dims = param_.W->dims(); + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + auto b_dims = param_.b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(w_dims[1], x_dims[1]) << "wrong shape: w_dims[1] != x_dims[1]"; + return true; +} + +bool SearchFcOpLite::InferShape() const { + auto out_size = param_.out_size; + lite::DDim dims(std::vector({-1, out_size})); + param_.Out->Resize(dims); + return true; +} + +bool SearchFcOpLite::AttachImpl(const cpp::OpDesc &op_desc, + lite::Scope *scope) { + auto X = op_desc.Input("X").front(); + auto W = op_desc.Input("W").front(); + auto b = op_desc.Input("b").front(); + auto Out = op_desc.Output("Out").front(); + + param_.X = scope->FindVar(X)->GetMutable(); + param_.W = scope->FindVar(W)->GetMutable(); + param_.b = scope->FindVar(b)->GetMutable(); + param_.Out = scope->FindVar(Out)->GetMutable(); + param_.out_size = op_desc.GetAttr("out_size"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_fc, paddle::lite::operators::SearchFcOpLite); diff --git a/lite/operators/search_fc_op.h b/lite/operators/search_fc_op.h new file mode 100644 index 0000000000..a871cadd33 --- /dev/null +++ b/lite/operators/search_fc_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchFcOpLite : public OpLite { + public: + SearchFcOpLite() {} + explicit SearchFcOpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "search_fc"; } + + private: + mutable SearchFcParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_grnn_op.cc b/lite/operators/search_grnn_op.cc new file mode 100644 index 0000000000..b56ae820bf --- /dev/null +++ b/lite/operators/search_grnn_op.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/search_grnn_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchGrnnOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.wi); + CHECK_OR_FALSE(param_.wh); + CHECK_OR_FALSE(param_.out); + CHECK_OR_FALSE(param_.tmp_buffer); + CHECK_OR_FALSE(param_.idx_sorted_by_width); + CHECK_OR_FALSE(param_.layout_input); + + int _cap_h = param_.num_hidden; + int _cap_e = param_.num_input; + + const auto& x_dims = param_.x->dims(); + CHECK_OR_FALSE(x_dims.size() == 2); + CHECK_OR_FALSE(x_dims[1] == _cap_e); + + const auto& wi_dims = param_.wi->dims(); + CHECK_OR_FALSE(wi_dims.size() == 3); + CHECK_OR_FALSE(wi_dims[0] == 3); + CHECK_OR_FALSE(wi_dims[1] == _cap_h); + CHECK_OR_FALSE(wi_dims[2] == _cap_e); + + const auto& wh_dims = param_.wh->dims(); + CHECK_OR_FALSE(wh_dims.size() == 3); + CHECK_OR_FALSE(wh_dims[0] == 3); + CHECK_OR_FALSE(wh_dims[1] == _cap_h); + CHECK_OR_FALSE(wh_dims[2] == _cap_h); + + return true; +} + +bool SearchGrnnOpLite::InferShape() const { + const auto& x_dims = param_.x->dims(); + const auto& x_lod = param_.x->lod(); + CHECK_OR_FALSE(!x_lod.empty()); + CHECK_OR_FALSE(x_dims[0] == x_lod[0].back()); + param_.out->set_lod(x_lod); + + return true; +} + +bool SearchGrnnOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + auto x = op_desc.Input("X").front(); + auto wi = op_desc.Input("Wi").front(); + auto wh = op_desc.Input("Wh").front(); + param_.x = scope->FindVar(x)->GetMutable(); + param_.wi = scope->FindVar(wi)->GetMutable(); + param_.wh = scope->FindVar(wh)->GetMutable(); + + param_.num_input = op_desc.GetAttr("num_input"); + param_.num_hidden = op_desc.GetAttr("num_hidden"); + + auto out = op_desc.Output("Out").front(); + auto tmp_buffer = op_desc.Output("tmp_buffer").front(); + auto idx_sorted_by_width = op_desc.Output("idx_sorted_by_width").front(); + auto layout_input = op_desc.Output("layout_input").front(); + param_.out = scope->FindVar(out)->GetMutable(); + param_.tmp_buffer = scope->FindVar(tmp_buffer)->GetMutable(); + param_.idx_sorted_by_width = + scope->FindVar(idx_sorted_by_width)->GetMutable(); + param_.layout_input = + scope->FindVar(layout_input)->GetMutable(); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_grnn, paddle::lite::operators::SearchGrnnOpLite); diff --git a/lite/operators/search_grnn_op.h b/lite/operators/search_grnn_op.h new file mode 100644 index 0000000000..670af8a6c9 --- /dev/null +++ b/lite/operators/search_grnn_op.h @@ -0,0 +1,48 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchGrnnOpLite : public OpLite { + public: + SearchGrnnOpLite() {} + + explicit SearchGrnnOpLite(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "search_grnn"; } + + private: + mutable SearchGrnnParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_group_padding_op.cc b/lite/operators/search_group_padding_op.cc new file mode 100644 index 0000000000..5ba4dde275 --- /dev/null +++ b/lite/operators/search_group_padding_op.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/search_group_padding_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchGroupPaddingOp::CheckShape() const { + CHECK_EQ(param_.x->dims().size(), 2) << "The rank of X(Input) should be 2."; + CHECK_EQ(param_.x->lod().empty(), false) + << "Input Tensor of X does not contain LoD information."; + CHECK_GE(param_.x->lod()[0].size(), 2) + << "The Input(X)'s lod info is corrupted."; + CHECK_EQ(param_.x->dims()[0], static_cast(param_.x->lod()[0].back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + + return true; +} + +bool SearchGroupPaddingOp::InferShape() const { + std::vector x_dims = param_.x->dims().Vectorize(); + + param_.out_emb_padding->Resize({-1, x_dims[1]}); + param_.out_new->Resize({x_dims[0], 1}); + param_.out_padding->Resize({-1, 1}); + return true; +} + +bool SearchGroupPaddingOp::AttachImpl(const cpp::OpDesc &op_desc, + lite::Scope *scope) { + auto x = op_desc.Input("X").front(); + auto out_emb_padding = op_desc.Output("Out_emb_padding").front(); + auto out_new = op_desc.Output("Out_new").front(); + auto out_padding = op_desc.Output("Out_padding").front(); + + param_.x = scope->FindVar(x)->GetMutable(); + param_.out_emb_padding = + scope->FindVar(out_emb_padding)->GetMutable(); + param_.out_new = scope->FindVar(out_new)->GetMutable(); + param_.out_padding = scope->FindVar(out_padding)->GetMutable(); + param_.pad_id = op_desc.GetAttr("pad_id"); + + CHECK(param_.out_emb_padding) + << "Output(Out_emb_padding) of SearchGroupPadding Op should not be null."; + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_group_padding, + paddle::lite::operators::SearchGroupPaddingOp); diff --git a/lite/operators/search_group_padding_op.h b/lite/operators/search_group_padding_op.h new file mode 100644 index 0000000000..a8e96c9697 --- /dev/null +++ b/lite/operators/search_group_padding_op.h @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchGroupPaddingOp : public OpLite { + public: + SearchGroupPaddingOp() {} + explicit SearchGroupPaddingOp(const std::string &op_type) : OpLite(op_type) {} + bool CheckShape() const override; + bool InferShape() const override; + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "search_group_padding"; } + + private: + mutable SearchGroupPaddingParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_seq_depadding_op.cc b/lite/operators/search_seq_depadding_op.cc new file mode 100644 index 0000000000..12d5123e05 --- /dev/null +++ b/lite/operators/search_seq_depadding_op.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/search_seq_depadding_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchSeqDepaddingOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.pad); + CHECK_OR_FALSE(param_.src); + CHECK_OR_FALSE(param_.out); + + DDim pad_dims = param_.pad->dims(); + DDim src_dims = param_.src->dims(); + CHECK_OR_FALSE(pad_dims.size() == 2); + CHECK_OR_FALSE(src_dims.size() == 2); + + const auto& pad_lod = param_.pad->lod(); + CHECK_OR_FALSE(!pad_lod.empty()); + const auto& pad_lod_0 = pad_lod[0]; + CHECK_OR_FALSE(pad_lod_0.size() >= 2); + CHECK_OR_FALSE(pad_dims[0] == pad_lod_0.back()); + + const auto& src_lod = param_.src->lod(); + CHECK_OR_FALSE(!src_lod.empty()); + const auto& src_lod_0 = src_lod[0]; + CHECK_OR_FALSE(src_lod_0.size() >= 2); + CHECK_OR_FALSE(src_dims[0] == src_lod_0.back()); + return true; +} + +bool SearchSeqDepaddingOpLite::InferShape() const { + DDim pad_dims = param_.pad->dims(); + param_.out->Resize({-1, pad_dims[1]}); + return true; +} + +bool SearchSeqDepaddingOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + auto pad = op_desc.Input("Pad").front(); + auto src = op_desc.Input("Src").front(); + auto out = op_desc.Output("Out").front(); + + param_.pad = scope->FindVar(pad)->GetMutable(); + param_.src = scope->FindVar(src)->GetMutable(); + param_.out = scope->FindVar(out)->GetMutable(); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_seq_depadding, + paddle::lite::operators::SearchSeqDepaddingOpLite); diff --git a/lite/operators/search_seq_depadding_op.h b/lite/operators/search_seq_depadding_op.h new file mode 100644 index 0000000000..445d9e0f3b --- /dev/null +++ b/lite/operators/search_seq_depadding_op.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/operators/op_params.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchSeqDepaddingOpLite : public OpLite { + public: + SearchSeqDepaddingOpLite() {} + + explicit SearchSeqDepaddingOpLite(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "search_seq_depadding"; } + + private: + mutable SearchSeqDepaddingParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_seq_fc_op.cc b/lite/operators/search_seq_fc_op.cc new file mode 100644 index 0000000000..c5cca5331a --- /dev/null +++ b/lite/operators/search_seq_fc_op.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/search_seq_fc_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchSeqFcOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.w); + CHECK_OR_FALSE(param_.out); + return true; +} + +bool SearchSeqFcOpLite::InferShape() const { + const auto x_dims = param_.x->dims(); + const auto w_dims = param_.w->dims(); + const auto& x_lod = param_.x->lod(); + auto out_size = param_.out_size; + CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; + CHECK(!x_lod.empty()) << "The Input(X) must hold lod info."; + const auto& x_lod_0 = x_lod[0]; + CHECK_GE(x_lod_0.size(), 2) << "The Input(X)'s lod info is corrupted."; + CHECK_EQ(x_dims[0], static_cast(x_lod_0.back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; + CHECK_EQ(w_dims[0], out_size) << "Wrong shape: w_dims[0] != out_size"; + + if (param_.b != nullptr) { + const auto b_dims = param_.b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; + } + + param_.out->set_lod(x_lod); + param_.out->Resize({x_dims[0], w_dims[0]}); + return true; +} + +bool SearchSeqFcOpLite::AttachImpl(const cpp::OpDesc& op_desc, + lite::Scope* scope) { + CHECK(!op_desc.Input("X").empty()); + CHECK(!op_desc.Input("W").empty()); + CHECK(!op_desc.Output("Out").empty()); + auto x = op_desc.Input("X").front(); + auto w = op_desc.Input("W").front(); + auto out = op_desc.Output("Out").front(); + param_.x = scope->FindVar(x)->GetMutable(); + param_.w = scope->FindVar(w)->GetMutable(); + param_.out = scope->FindVar(out)->GetMutable(); + param_.out_size = op_desc.GetAttr("out_size"); + bool has_bias = op_desc.GetAttr("has_bias"); + if (has_bias) { + CHECK(!op_desc.Input("b").empty()); + auto b = op_desc.Input("b").front(); + param_.b = scope->FindVar(b)->GetMutable(); + } + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_seq_fc, paddle::lite::operators::SearchSeqFcOpLite); diff --git a/lite/operators/search_seq_fc_op.h b/lite/operators/search_seq_fc_op.h new file mode 100644 index 0000000000..3c4f7d82bf --- /dev/null +++ b/lite/operators/search_seq_fc_op.h @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchSeqFcOpLite : public OpLite { + public: + SearchSeqFcOpLite() {} + + explicit SearchSeqFcOpLite(const std::string &type) : OpLite(type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; + std::string DebugString() const override { return "search_seq_fc"; } + + private: + mutable SearchSeqFcParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/search_seq_softmax_op.cc b/lite/operators/search_seq_softmax_op.cc new file mode 100644 index 0000000000..973ffa04c4 --- /dev/null +++ b/lite/operators/search_seq_softmax_op.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/search_seq_softmax_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SearchSeqSoftmaxOp::CheckShape() const { + CHECK_OR_FALSE(param_.x); + CHECK_OR_FALSE(param_.output); + return true; +} + +bool SearchSeqSoftmaxOp::InferShape() const { + param_.output->Resize(param_.x->dims()); + param_.output->set_lod(param_.x->lod()); + return true; +} + +bool SearchSeqSoftmaxOp::AttachImpl(const cpp::OpDesc &opdesc, + lite::Scope *scope) { + param_.x = const_cast( + &scope->FindVar(opdesc.Input("X").front())->Get()); + param_.output = + scope->FindVar(opdesc.Output("Out").front())->GetMutable(); + param_.axis = 1; + + CHECK(param_.x); + CHECK(param_.output); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(search_seq_softmax, + paddle::lite::operators::SearchSeqSoftmaxOp); diff --git a/lite/operators/search_seq_softmax_op.h b/lite/operators/search_seq_softmax_op.h new file mode 100644 index 0000000000..f97e8ddd3a --- /dev/null +++ b/lite/operators/search_seq_softmax_op.h @@ -0,0 +1,47 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
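
Note on search_seq_softmax above: it reuses SoftmaxParam and hard-codes axis = 1, i.e. the normalisation runs over the columns of the 2-D sequence tensor while dims and LoD are passed through unchanged. For reference only, a generic row-wise softmax over a row-major [rows, cols] buffer (a sketch of what axis = 1 means, not the Lite kernel):

#include <algorithm>
#include <cmath>
#include <vector>

// Reference sketch: softmax along axis 1 of a row-major [rows, cols] buffer.
void RowSoftmax(std::vector<float>* data, int rows, int cols) {
  for (int r = 0; r < rows; ++r) {
    float* row = data->data() + r * cols;
    float max_v = *std::max_element(row, row + cols);  // for numerical stability
    float sum = 0.f;
    for (int c = 0; c < cols; ++c) {
      row[c] = std::exp(row[c] - max_v);
      sum += row[c];
    }
    for (int c = 0; c < cols; ++c) row[c] /= sum;
  }
}
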
+ +#pragma once + +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SearchSeqSoftmaxOp : public OpLite { + public: + SearchSeqSoftmaxOp() {} + explicit SearchSeqSoftmaxOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "search_seq_softmax_op"; } + + private: + mutable SoftmaxParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_arithmetic_op.cc b/lite/operators/sequence_arithmetic_op.cc new file mode 100644 index 0000000000..29c39ebc23 --- /dev/null +++ b/lite/operators/sequence_arithmetic_op.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/sequence_arithmetic_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceArithmeticOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Y); + CHECK_EQ(param_.X->dims().size(), 2) << "Input X should a 2-D Tensor"; + CHECK_EQ(param_.Y->dims().size(), 2) << "Input Y should a 2-D Tensor"; + CHECK_OR_FALSE(param_.Out); + return true; +} + +bool SequenceArithmeticOp::InferShape() const { + param_.Out->Resize(param_.X->dims()); + param_.Out->set_lod(param_.X->lod()); + return true; +} + +bool SequenceArithmeticOp::AttachImpl(const cpp::OpDesc &opdesc, + lite::Scope *scope) { + param_.X = scope->FindTensor(opdesc.Input("X").front()); + param_.Y = scope->FindTensor(opdesc.Input("Y").front()); + param_.Out = scope->FindMutableTensor(opdesc.Output("Out").front()); + + param_.op_type = opdesc.GetAttr("op_type"); + + CHECK(param_.X); + CHECK(param_.Y); + CHECK(param_.Out); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_arithmetic, + paddle::lite::operators::SequenceArithmeticOp); +REGISTER_LITE_OP(search_seq_arithmetic, + paddle::lite::operators::SequenceArithmeticOp); diff --git a/lite/operators/sequence_arithmetic_op.h b/lite/operators/sequence_arithmetic_op.h new file mode 100644 index 0000000000..9f844dfbf4 --- /dev/null +++ b/lite/operators/sequence_arithmetic_op.h @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequenceArithmeticOp : public OpLite { + public: + SequenceArithmeticOp() {} + explicit SequenceArithmeticOp(const std::string &op_type) : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + + std::string DebugString() const override { return "sequence_arithmetic"; } + + private: + mutable SequenceArithmeticParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_concat_op.cc b/lite/operators/sequence_concat_op.cc new file mode 100644 index 0000000000..2a54df890c --- /dev/null +++ b/lite/operators/sequence_concat_op.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/sequence_concat_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceConcatOp::CheckShape() const { + CHECK_GT(param_.X.size(), 1) + << "The number of input sequences is at least two."; + CHECK_OR_FALSE(param_.Out); + size_t lod_size = 0; + for (const auto &t : param_.X) { + CHECK_EQ(t->lod().empty(), false) + << "Input Tensor of X does not contain LoD information."; + // CHECK_EQ(t->lod().size(), 1) << "Only support one level sequence now."; + if (lod_size == 0) { + lod_size = t->lod()[0].size(); + } else { + CHECK_EQ(t->lod()[0].size(), lod_size) + << "The number of sequence must be same between each input"; + } + } + CHECK_NE(lod_size, 0) << "Each input must have sequence information"; + return true; +} + +bool SequenceConcatOp::InferShape() const { + int64_t batch_size = 0; + int64_t feature_size = 0; + std::vector out_dims; + for (const auto &tensor : param_.X) { + const auto x_dims = tensor->dims(); + if (out_dims.empty()) { + out_dims = x_dims.Vectorize(); + } + batch_size += x_dims[0]; + if (feature_size == 0) { + feature_size = x_dims.production() / x_dims[0]; + } else { + CHECK_EQ(feature_size, x_dims.production() / x_dims[0]) + << "Inputs of sequence concat must have same feature size"; + } + } + if (batch_size < 0) { + batch_size = -1; // Normalize batch size for compile time. + } + out_dims[0] = batch_size; + param_.Out->Resize(out_dims); + // LoD info will be computed in Kernel. 
+ return true; +} + +bool SequenceConcatOp::AttachImpl(const cpp::OpDesc &opdesc, + lite::Scope *scope) { + auto input_list = opdesc.Input("X"); + param_.X.clear(); + for (auto var : input_list) { + param_.X.push_back(scope->FindVar(var)->GetMutable()); + } + param_.Out = + scope->FindVar(opdesc.Output("Out").front())->GetMutable(); + CHECK(param_.Out) << "Output(Out) of Sequence Concat Op should not be null."; + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_concat, paddle::lite::operators::SequenceConcatOp); diff --git a/lite/operators/sequence_concat_op.h b/lite/operators/sequence_concat_op.h new file mode 100644 index 0000000000..8cdc07ebca --- /dev/null +++ b/lite/operators/sequence_concat_op.h @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequenceConcatOp : public OpLite { + public: + SequenceConcatOp() {} + explicit SequenceConcatOp(const std::string &op_type) : OpLite(op_type) {} + bool CheckShape() const override; + bool InferShape() const override; + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "sequence_concat"; } + + private: + mutable SequenceConcatParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_reverse_op.cc b/lite/operators/sequence_reverse_op.cc new file mode 100644 index 0000000000..dd8fa2e8fd --- /dev/null +++ b/lite/operators/sequence_reverse_op.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
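// The SequenceConcatOp above defers LoD computation to the kernel. As a
// hedged sketch of the intended semantics (assuming the i-th output sequence
// is the i-th sequence of every input joined together), the output offsets
// would accumulate per-sequence lengths like this:

#include <cstdint>
#include <vector>

std::vector<uint64_t> ConcatLod(
    const std::vector<std::vector<uint64_t>>& in_lods) {
  size_t seq_num = in_lods.front().size() - 1;  // every input has this many
  std::vector<uint64_t> out(seq_num + 1, 0);
  for (size_t i = 0; i < seq_num; ++i) {
    uint64_t len = 0;
    for (const auto& lod : in_lods) len += lod[i + 1] - lod[i];
    out[i + 1] = out[i] + len;
  }
  return out;  // e.g. {0, 2, 4} and {0, 1, 3} -> {0, 3, 7}
}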
+ +#include "lite/operators/sequence_reverse_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceReverseOp::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.Out); + CHECK_EQ(param_.X->lod().empty(), false) + << "Input(X) Tensor of SequenceReverseOp does not contain " + "LoD information."; + CHECK_GE(param_.X->dims().size(), 2) + << "Rank of Input(X) must be not less than 2."; + return true; +} + +bool SequenceReverseOp::InferShape() const { + const auto *input = param_.X; + auto out_dims = input->dims(); + param_.Out->Resize(out_dims); + return true; +} + +bool SequenceReverseOp::AttachImpl(const cpp::OpDesc &opdesc, + lite::Scope *scope) { + param_.X = const_cast( + &scope->FindVar(opdesc.Input("X").front())->Get()); + param_.Out = + scope->FindVar(opdesc.Output("Y").front())->GetMutable(); + CHECK(param_.X); + CHECK(param_.Out); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_reverse, paddle::lite::operators::SequenceReverseOp); diff --git a/lite/operators/sequence_reverse_op.h b/lite/operators/sequence_reverse_op.h new file mode 100644 index 0000000000..326d0f6892 --- /dev/null +++ b/lite/operators/sequence_reverse_op.h @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequenceReverseOp : public OpLite { + public: + SequenceReverseOp() {} + explicit SequenceReverseOp(const std::string &op_type) : OpLite(op_type) {} + bool CheckShape() const override; + bool InferShape() const override; + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "sequence_reverse"; } + + private: + mutable SequenceReverseParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/sequence_topk_avg_pooling_op.cc b/lite/operators/sequence_topk_avg_pooling_op.cc new file mode 100644 index 0000000000..6f5cbeeeee --- /dev/null +++ b/lite/operators/sequence_topk_avg_pooling_op.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/sequence_topk_avg_pooling_op.h" +#include "lite/core/op_lite.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool SequenceTopkAvgPoolingOpLite::CheckShape() const { + CHECK_OR_FALSE(param_.X); + CHECK_OR_FALSE(param_.ROW); + CHECK_OR_FALSE(param_.COLUMN); + CHECK_OR_FALSE(param_.Out); + CHECK_OR_FALSE(param_.pos); + return true; +} + +bool SequenceTopkAvgPoolingOpLite::InferShape() const { + int channel_num = param_.channel_num; + std::vector topks = param_.topks; + auto row_dim = param_.ROW->dims(); + auto num_k = topks.size(); + auto row_shape_0 = row_dim[0]; + std::vector vec_out_shape; + vec_out_shape.push_back(row_shape_0); + vec_out_shape.push_back(channel_num * num_k); + + param_.Out->Resize(lite::DDim(vec_out_shape)); + param_.Out->set_lod(param_.ROW->lod()); + return true; +} + +bool SequenceTopkAvgPoolingOpLite::AttachImpl(const cpp::OpDesc &op_desc, + lite::Scope *scope) { + auto X = op_desc.Input("X").front(); + auto ROW = op_desc.Input("ROW").front(); + auto COLUMN = op_desc.Input("COLUMN").front(); + auto Out = op_desc.Output("Out").front(); + auto pos = op_desc.Output("pos").front(); + + param_.X = scope->FindVar(X)->GetMutable(); + param_.ROW = scope->FindVar(ROW)->GetMutable(); + param_.COLUMN = scope->FindVar(COLUMN)->GetMutable(); + param_.Out = scope->FindVar(Out)->GetMutable(); + param_.pos = scope->FindVar(pos)->GetMutable(); + param_.channel_num = op_desc.GetAttr("channel_num"); + param_.topks = op_desc.GetAttr>("topks"); + + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(sequence_topk_avg_pooling, + paddle::lite::operators::SequenceTopkAvgPoolingOpLite); diff --git a/lite/operators/sequence_topk_avg_pooling_op.h b/lite/operators/sequence_topk_avg_pooling_op.h new file mode 100644 index 0000000000..1c1cfe3a9c --- /dev/null +++ b/lite/operators/sequence_topk_avg_pooling_op.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
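// In isolation, the InferShape above sizes Out from ROW and the attributes:
// {ROW.dims()[0], channel_num * topks.size()}, with ROW's LoD copied over.

#include <cstdint>
#include <vector>

std::vector<int64_t> TopkAvgPoolOutShape(int64_t row_dim0,
                                         int channel_num,
                                         size_t num_topks) {
  return {row_dim0,
          static_cast<int64_t>(channel_num) * static_cast<int64_t>(num_topks)};
}
// e.g. TopkAvgPoolOutShape(6, 4, /*topks = {1, 3, 5}*/ 3) -> {6, 12}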
+ +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" +#include "lite/utils/all.h" + +namespace paddle { +namespace lite { +namespace operators { + +class SequenceTopkAvgPoolingOpLite : public OpLite { + public: + SequenceTopkAvgPoolingOpLite() {} + explicit SequenceTopkAvgPoolingOpLite(const std::string &op_type) + : OpLite(op_type) {} + + bool CheckShape() const override; + + bool InferShape() const override; + + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { + return "sequence_topk_avg_pooling"; + } + + private: + mutable SequenceTopkAvgPoolingParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/operators/split_op.cc b/lite/operators/split_op.cc index 18280616aa..ec98a0d6c3 100644 --- a/lite/operators/split_op.cc +++ b/lite/operators/split_op.cc @@ -39,8 +39,16 @@ bool SplitOp::InferShape() const { const int outs_number = outs.size(); std::vector outs_dims; outs_dims.reserve(outs_number); - - if (num > 0) { + std::vector sections_tensor_list_ = + param_.sections_tensor_list; + if (sections.size() > 0 && sections_tensor_list_.size() > 0) { + std::vector vec_sections; + for (size_t i = 0; i < sections_tensor_list_.size(); ++i) { + auto dim = in_dims; + dim[axis] = sections_tensor_list_[i]->data()[0]; + outs_dims.push_back(dim); + } + } else if (num > 0) { int out_axis_dim = in_dims[axis] / num; for (int i = 0; i < outs_number; ++i) { auto dim = in_dims; @@ -55,6 +63,10 @@ bool SplitOp::InferShape() const { } } + if (param_.axis_tensor != nullptr) { + axis = param_.axis_tensor->data()[0]; + } + for (int j = 0; j < outs_dims.size(); ++j) { outs[j]->Resize(outs_dims[j]); } @@ -73,6 +85,21 @@ bool SplitOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { for (auto var : outs) { param_.output.push_back(scope->FindVar(var)->GetMutable()); } + std::vector input_arg_names = opdesc.InputArgumentNames(); + if (std::find(input_arg_names.begin(), input_arg_names.end(), "AxisTensor") != + input_arg_names.end()) { + auto args = opdesc.Input("AxisTensor"); + auto *var = scope->FindVar(args.front()); + param_.axis_tensor = var->GetMutable(); + } + if (std::find(input_arg_names.begin(), + input_arg_names.end(), + "SectionsTensorList") != input_arg_names.end()) { + auto args = opdesc.Input("SectionsTensorList"); + auto *var = scope->FindVar(args.front()); + param_.sections_tensor_list = + *(var->GetMutable>()); + } return true; } diff --git a/lite/operators/unsqueeze_op.cc b/lite/operators/unsqueeze_op.cc index 8db14d0660..39b275b7b5 100644 --- a/lite/operators/unsqueeze_op.cc +++ b/lite/operators/unsqueeze_op.cc @@ -66,10 +66,7 @@ bool UnsqueezeOp::InferShape() const { std::vector final_axes; auto axes = param_.axes; auto *axes_tensor = param_.axes_tensor; - std::vector axes_tensor_vct; - if (param_.axes_tensor_vct) { - axes_tensor_vct = *(param_.axes_tensor_vct); - } + auto axes_tensor_vct = param_.axes_tensor_vct; if (!axes.empty()) { final_axes = axes; @@ -79,7 +76,7 @@ bool UnsqueezeOp::InferShape() const { axes_tensor_data + axes_tensor->numel()); } else if (!axes_tensor_vct.empty()) { for (int i = 0; i < axes_tensor_vct.size(); i++) { - final_axes.push_back(axes_tensor_vct[i].data()[0]); + final_axes.push_back(axes_tensor_vct[i]->data()[0]); } } else { LOG(FATAL) << "Input axis error"; @@ -114,16 +111,12 @@ bool UnsqueezeOp::AttachImpl(const 
cpp::OpDesc &opdesc, lite::Scope *scope) { if (opdesc.HasInput("AxesTensorList") && opdesc.Input("AxesTensorList").size() > 0) { auto args = opdesc.Input("AxesTensorList"); - /* for (auto arg : args) { auto *var = scope->FindVar(arg); if (var != nullptr) { param_.axes_tensor_vct.push_back(var->GetMutable()); } } - */ - auto *var = scope->FindVar(args.front()); - param_.axes_tensor_vct = var->GetMutable>(); } CHECK(param_.X) << "Input(X) of UnsqueezeOp should not be null."; CHECK(param_.Out) << "Output(Out) of UnsqueezeOp should not be null."; diff --git a/lite/operators/var_conv_2d_op.cc b/lite/operators/var_conv_2d_op.cc new file mode 100644 index 0000000000..5c7fe374fc --- /dev/null +++ b/lite/operators/var_conv_2d_op.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/var_conv_2d_op.h" +#include "lite/core/op_registry.h" + +namespace paddle { +namespace lite { +namespace operators { + +bool VarConv2dOp::CheckShape() const { + auto x_dims = param_.X->dims(); + CHECK_EQ(x_dims.size(), 2) << "The rank of X(Input) can't be less than 2."; + auto w_dims = param_.W->dims(); + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor"; + CHECK_EQ(w_dims[0], param_.output_channel) + << "W dim[0] should be equal to OutputChannel"; + CHECK_EQ(w_dims[1], param_.input_channel * param_.kernel_h * param_.kernel_w) + << "W dim[1] should be equal to InputChannel * KernelH * KernelW"; + LoD x_lod = param_.X->lod(); + CHECK_EQ(x_lod.empty(), false) << "The Input(X) must hold lod info."; + // CHECK_GE(x_lod.size(), 1) << "The Input(X)'s lod info is corrupted."; + CHECK_GE(x_lod.size(), 3) << "The Input(X)'s lod info is corrupted."; + CHECK_EQ(x_dims[0], static_cast(x_lod[0].back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + // LoD row_lod = param_.ROW->lod(); + // CHECK_EQ(row_lod.empty(), false) << "The Input(ROW) must hold lod info."; + // LoD col_lod = param_.COLUMN->lod(); + // CHECK_EQ(col_lod.empty(), false) << "The Input(COLUMN) must hold lod + // info."; + return true; +} + +bool VarConv2dOp::InferShape() const { return true; } + +bool VarConv2dOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { + param_.X = const_cast( + &scope->FindVar(opdesc.Input("X").front())->Get()); + // param_.ROW = const_cast( + // &scope->FindVar(opdesc.Input("ROW").front())->Get()); + // param_.COLUMN = const_cast( + // &scope->FindVar(opdesc.Input("COLUMN").front())->Get()); + param_.W = const_cast( + &scope->FindVar(opdesc.Input("W").front())->Get()); + param_.Out = + scope->FindVar(opdesc.Output("Out").front())->GetMutable(); + param_.Col = + scope->FindVar(opdesc.Output("Col").front())->GetMutable(); + CHECK(param_.X) << "X(Input) of VarConv2dOP should not be null."; + // CHECK(param_.ROW) << "Input(ROW) of VarConv2dOP should not be null."; + // CHECK(param_.COLUMN) << "Input(COLUMN) of VarConv2dOP should not be null."; + CHECK(param_.W) << "W(Input) of VarConv2dOP 
should not be null."; + CHECK(param_.Out) << "Out(Output) of VarConv2dOP should not be null."; + CHECK(param_.Col) << "Col(Output) of VarConv2dOP should not be null."; + param_.output_channel = opdesc.GetAttr("OutputChannel"); + param_.input_channel = opdesc.GetAttr("InputChannel"); + param_.kernel_h = opdesc.GetAttr("KernelH"); + param_.kernel_w = opdesc.GetAttr("KernelW"); + param_.stride_h = opdesc.GetAttr("StrideH"); + param_.stride_w = opdesc.GetAttr("StrideW"); + return true; +} + +} // namespace operators +} // namespace lite +} // namespace paddle + +REGISTER_LITE_OP(var_conv_2d, paddle::lite::operators::VarConv2dOp); diff --git a/lite/operators/var_conv_2d_op.h b/lite/operators/var_conv_2d_op.h new file mode 100644 index 0000000000..ce6309419c --- /dev/null +++ b/lite/operators/var_conv_2d_op.h @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "lite/core/op_lite.h" +#include "lite/core/scope.h" + +namespace paddle { +namespace lite { +namespace operators { + +class VarConv2dOp : public OpLite { + public: + VarConv2dOp() {} + explicit VarConv2dOp(const std::string &op_type) : OpLite(op_type) {} + bool CheckShape() const override; + bool InferShape() const override; + bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; + void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } + std::string DebugString() const override { return "var_conv_2d"; } + + private: + mutable VarConv2DParam param_; +}; + +} // namespace operators +} // namespace lite +} // namespace paddle diff --git a/lite/tests/cv/image_convert_test.cc b/lite/tests/cv/image_convert_test.cc index 7c0f867fae..eefd30f74f 100644 --- a/lite/tests/cv/image_convert_test.cc +++ b/lite/tests/cv/image_convert_test.cc @@ -17,8 +17,8 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/tests/cv/cv_basic.h" -#include "lite/tests/utils/timer.h" #include "lite/utils/cv/paddle_image_preprocess.h" DEFINE_int32(cluster, 3, "cluster id"); @@ -46,7 +46,7 @@ typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess; typedef paddle::lite_api::Tensor Tensor_api; typedef paddle::lite::Tensor Tensor; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; void fill_tensor_host_rand(uint8_t* dio, int64_t size) { uint seed = 256; @@ -285,8 +285,8 @@ void test_img(const std::vector& cluster_id, ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); for (int i = 0; i < test_iter; ++i) { - t1.clear(); - t1.start(); + t1.Reset(); + t1.Start(); LOG(INFO) << "image convert saber compute"; // 方法一: image_preprocess.imageCovert(src, lite_dst); @@ -329,8 +329,8 @@ void test_img(const std::vector& cluster_id, means, scales); - t1.end(); - double tdiff = t1.get_average_ms(); + t1.Stop(); + double tdiff = t1.LapTimes().Avg(); to += tdiff; if (tdiff < min_time) { min_time = tdiff; diff --git 
a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index 02d40ce6cc..549fabab5a 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -39,6 +39,8 @@ if(LITE_BUILD_EXTRA) lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_generate_proposals_compute SRCS generate_proposals_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) #lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) diff --git a/lite/tests/kernels/bilinear_interp_compute_test.cc b/lite/tests/kernels/bilinear_interp_compute_test.cc index 0779caf67a..7ea4293f08 100644 --- a/lite/tests/kernels/bilinear_interp_compute_test.cc +++ b/lite/tests/kernels/bilinear_interp_compute_test.cc @@ -22,6 +22,27 @@ namespace paddle { namespace lite { +inline std::vector get_new_shape( + std::vector list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + vec_new_shape.push_back(static_cast(*(tensor->data()))); + } + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + lite::Tensor cpu_starts_tensor; + vec_new_data = + std::vector(new_data, new_data + new_data_tensor->dims().production()); + return vec_new_data; +} + template void resize_bilinear_align(std::vector inputs, lite::Tensor* output) { @@ -149,6 +170,9 @@ class BilinearInterpComputeTester : public arena::TestCase { protected: // common attributes for this op. 
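// A hedged sketch of how the interp testers resolve the target size from the
// new optional inputs (the helpers above read one value per SizeTensor
// element and a scalar from Scale); plain ints stand in for the tensors, and
// the name InterpOutSize is illustrative only:

#include <utility>
#include <vector>

std::pair<int, int> InterpOutSize(
    int in_h, int in_w,
    const std::vector<int>& size_tensor,  // SizeTensor: {out_h, out_w}
    float scale,                          // Scale, <= 0 when absent
    const std::vector<int>& out_size) {   // OutSize: {out_h, out_w}
  if (!size_tensor.empty()) return {size_tensor[0], size_tensor[1]};
  int h = in_h, w = in_w;
  if (scale > 0) {
    h = static_cast<int>(in_h * scale);
    w = static_cast<int>(in_w * scale);
  }
  if (!out_size.empty()) {
    h = out_size[0];
    w = out_size[1];
  }
  return {h, w};
}
// e.g. InterpOutSize(16, 16, {}, 2.f, {}) -> {32, 32}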
std::string input0_ = "X"; + std::string sizetensor0_ = "SizeTensor0"; + std::string sizetensor1_ = "SizeTensor1"; + std::string input_scale_ = "Scale"; std::string input1_ = "OutSize"; std::string output_ = "Out"; @@ -162,6 +186,8 @@ class BilinearInterpComputeTester : public arena::TestCase { std::string interp_method_ = "Bilinear"; DDim _dims0_{{1, 1, 16, 16}}; DDim _dims1_{{2}}; + DDim sizetensor_dims_{{1}}; + DDim scale_dims_{{1}}; public: BilinearInterpComputeTester(const Place& place, @@ -190,33 +216,48 @@ class BilinearInterpComputeTester : public arena::TestCase { if (outsize_height_ > 0 && outsize_width_ > 0) { inputs.emplace_back(scope->FindTensor(input1_)); } + std::vector SizeTensor; + if (outsize_height_ > 0 && outsize_width_ > 0) { + SizeTensor.emplace_back(scope->FindTensor(sizetensor0_)); + SizeTensor.emplace_back(scope->FindTensor(sizetensor1_)); + } + const lite::Tensor* input_scale = scope->FindTensor(input_scale_); + float scale = height_scale_; + int in_h = inputs[0]->dims()[2]; + int in_w = inputs[0]->dims()[3]; + if (SizeTensor.size() > 0) { + auto new_size = get_new_shape(SizeTensor); + out_height_ = new_size[0]; + out_width_ = new_size[1]; + } else { + auto scale_tensor = input_scale; + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } + if (scale > 0) { + out_height_ = static_cast(in_h * scale); + out_width_ = static_cast(in_w * scale); + } + if (inputs.size() > 1) { + auto out_size = inputs[1]; + auto out_size_data = get_new_data_from_tensor(out_size); + out_height_ = out_size_data[0]; + out_width_ = out_size_data[1]; + } + } + height_scale_ = scale; + width_scale_ = scale; + if (out_width_ != -1 && out_height_ != -1) { height_scale_ = static_cast(out_height_ / inputs[0]->dims()[2]); width_scale_ = static_cast(out_width_ / inputs[0]->dims()[3]); } auto* outputs = scope->NewTensor(output_); CHECK(outputs); - if (inputs.size() > 1) { - auto outsize_data = inputs[1]->data(); - int h_out = outsize_data[0]; // HW - int w_out = outsize_data[1]; // HW - int num_cout = inputs[0]->dims()[0]; - int c_cout = inputs[0]->dims()[1]; - outputs->Resize({num_cout, c_cout, h_out, w_out}); - } else { - int out_h; - int out_w; - if (-1 == out_height_ && -1 == out_width_) { - out_h = inputs[0]->dims()[2] * height_scale_; - out_w = inputs[0]->dims()[3] * width_scale_; - } else { - out_h = out_height_; - out_w = out_width_; - } - outputs->Resize( - {inputs[0]->dims()[0], inputs[0]->dims()[1], out_h, out_w}); - } - + int num_cout = inputs[0]->dims()[0]; + int c_cout = inputs[0]->dims()[1]; + outputs->Resize({num_cout, c_cout, out_height_, out_width_}); if (align_corners_) { resize_bilinear_align(inputs, outputs); } else { @@ -229,6 +270,10 @@ class BilinearInterpComputeTester : public arena::TestCase { op_desc->SetInput("X", {input0_}); if (outsize_height_ > 0 && outsize_width_ > 0) { op_desc->SetInput("OutSize", {input1_}); + op_desc->SetInput("SizeTensor", {sizetensor0_, sizetensor1_}); + } + if (height_scale_ > 0) { + op_desc->SetInput("Scale", {input_scale_}); } op_desc->SetOutput("Out", {output_}); op_desc->SetAttr("scale", height_scale_); @@ -250,6 +295,19 @@ class BilinearInterpComputeTester : public arena::TestCase { data1[0] = outsize_height_; data1[1] = outsize_width_; SetCommonTensor(input1_, _dims1_, data1.data()); + + std::vector sizetensor_data(1); + sizetensor_data[0] = outsize_height_; + SetCommonTensor(sizetensor0_, sizetensor_dims_, sizetensor_data.data()); + + sizetensor_data[0] = outsize_width_; + 
SetCommonTensor(sizetensor1_, sizetensor_dims_, sizetensor_data.data()); + } + + if (height_scale_ > 0) { + std::vector scale_data(1); + scale_data[0] = height_scale_; + SetCommonTensor(input_scale_, scale_dims_, scale_data.data()); } } }; diff --git a/lite/tests/kernels/conv2d_transpose_compute_test.cc b/lite/tests/kernels/conv2d_transpose_compute_test.cc index a287f0bb66..6c348076ba 100644 --- a/lite/tests/kernels/conv2d_transpose_compute_test.cc +++ b/lite/tests/kernels/conv2d_transpose_compute_test.cc @@ -31,8 +31,10 @@ void col2im(const Dtype* data_col, const int width, const int kernel_h, const int kernel_w, - const int pad_h, - const int pad_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, const int stride_h, const int stride_w, const int dilation_h, @@ -40,19 +42,22 @@ void col2im(const Dtype* data_col, Dtype* data_im) { memset(data_im, 0, height * width * channels * sizeof(float)); const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) / + stride_h + + 1; const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + (width + pad_w0 + pad_w1 - (dilation_w * (kernel_w - 1) + 1)) / stride_w + + 1; const int channel_size = height * width; for (int channel = channels; channel--; data_im += channel_size) { for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; + int input_row = -pad_h0 + kernel_row * dilation_h; for (int output_rows = output_h; output_rows; output_rows--) { if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { data_col += output_w; } else { - int input_col = -pad_w + kernel_col * dilation_w; + int input_col = -pad_w0 + kernel_col * dilation_w; for (int output_col = output_w; output_col; output_col--) { if (is_a_ge_zero_and_a_lt_b(input_col, width)) { data_im[input_row * width + input_col] += *data_col; @@ -104,6 +109,34 @@ void fill_bias_relu(float* tensor, } } +inline void UpdatePaddingAndDilation(std::vector* paddings, + std::vector* dilations, + const std::vector& strides, + const std::string padding_algorithm, + const DDim data_dims, + const std::vector& ksize) { + // when padding_desc is "VALID" or "SAME" + if (padding_algorithm == "SAME") { + for (size_t i = 0; i < strides.size(); ++i) { + int out_size = (data_dims[i + 2] + strides[i] - 1) / strides[i]; + int pad_sum = std::max( + (out_size - 1) * strides[i] + ksize[i + 2] - data_dims[i + 2], + (int64_t)0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + // pad + *(paddings->begin() + i * 2) = pad_0; + *(paddings->begin() + i * 2 + 1) = pad_1; + // dilation + *(dilations->begin() + i) = 1; + } + } else if (padding_algorithm == "VALID") { + for (auto& it : *paddings) { + it = 0; + } + } +} + template static void basic_gemm(int m, int n, @@ -172,8 +205,10 @@ bool deconv_basic(const Dtype1* din, int stride_h, int dila_w, int dila_h, - int pad_w, - int pad_h, + int pad_w0, + int pad_w1, + int pad_h0, + int pad_h1, bool flag_bias, bool flag_relu) { int m = chout * kernel_w * kernel_h / group; @@ -193,8 +228,9 @@ bool deconv_basic(const Dtype1* din, int group_size_coldata = m * n; int group_size_weights = chin * chout * kernel_w * kernel_h / (group * group); bool flag_1x1s1p1 = (kernel_w == 1) && (kernel_h == 1) && (stride_h == 1) && - (stride_w == 1) && (pad_w == 1) && (pad_h == 1) && - (dila_w == 1) && (dila_h == 
1); + (stride_w == 1) && (pad_w0 == 0) && (pad_h0 == 0) && + (pad_w1 == 0) && (pad_h1 == 0) && (dila_w == 1) && + (dila_h == 1); for (int i = 0; i < num; ++i) { const Dtype1* din_batch = din + i * chin * hin * win; @@ -204,7 +240,7 @@ bool deconv_basic(const Dtype1* din, if (flag_1x1s1p1) { col_data = dout_batch; } - memset(col_data, 0, sizeof(Dtype2) * group_size_coldata); + memset(col_data, 0, sizeof(Dtype2) * group_size_coldata * group); for (int g = 0; g < group; ++g) { const Dtype1* din_group = din_batch + g * group_size_in; const Dtype1* weights_group = weights + g * group_size_weights; @@ -230,8 +266,10 @@ bool deconv_basic(const Dtype1* din, wout, kernel_h, kernel_w, - pad_h, - pad_w, + pad_h0, + pad_h1, + pad_w0, + pad_w1, stride_h, stride_w, dila_h, @@ -253,9 +291,10 @@ class Conv2DTransposeComputeTester : public arena::TestCase { std::string output_ = "out"; std::string filter_ = "filter"; std::string bias_ = "bias"; + std::string padding_algorithm_ = ""; std::vector strides_{1, 1}; - std::vector paddings_{0, 0}; + std::vector paddings_{0, 0, 0, 0}; int groups_{1}; std::vector dilations_{1, 1}; bool flag_relu_{false}; @@ -280,9 +319,13 @@ class Conv2DTransposeComputeTester : public arena::TestCase { bool flag_relu, int dilation, int stride, - int padding, + int pad_h0, + int pad_h1, + int pad_w0, + int pad_w1, int ks, - int groups) + int groups, + std::string padding_algorithm) : TestCase(place, alias) { n_ = n; ic_ = ic; @@ -291,20 +334,29 @@ class Conv2DTransposeComputeTester : public arena::TestCase { iw_ = iw; ks_ = ks; flag_bias_ = flag_bias; - + padding_algorithm_ = padding_algorithm; strides_ = std::vector({stride, stride}); - paddings_ = std::vector({padding, padding}); - groups_ = groups; + paddings_ = std::vector({pad_h0, pad_h1, pad_w0, pad_w1}); dilations_ = std::vector({dilation, dilation}); + groups_ = groups; flag_relu_ = flag_relu; } void RunBaseline(Scope* scope) override { auto* out = scope->NewTensor(output_); CHECK(out); - int oh = (ih_ - 1) * strides_[0] - 2 * paddings_[0] + + auto* x = scope->FindTensor(x_); + auto input_dim = x->dims(); + std::vector ksize({1, 1, ks_, ks_}); + UpdatePaddingAndDilation(&paddings_, + &dilations_, + strides_, + padding_algorithm_, + input_dim, + ksize); + int oh = (ih_ - 1) * strides_[0] - paddings_[0] - paddings_[1] + dilations_[0] * (ks_ - 1) + 1; - int ow = (iw_ - 1) * strides_[1] - 2 * paddings_[1] + + int ow = (iw_ - 1) * strides_[1] - paddings_[2] - paddings_[3] + dilations_[1] * (ks_ - 1) + 1; CHECK(oh > 0 || ow > 0); @@ -313,7 +365,6 @@ class Conv2DTransposeComputeTester : public arena::TestCase { out->Resize(output_dims); auto* output_data = out->mutable_data(); - auto* x = scope->FindTensor(x_); const auto* x_data = x->data(); auto* filter = scope->FindTensor(filter_); const auto* filter_data = filter->data(); @@ -341,8 +392,10 @@ class Conv2DTransposeComputeTester : public arena::TestCase { strides_[0], dilations_[1], dilations_[0], - paddings_[1], + paddings_[2], + paddings_[3], paddings_[0], + paddings_[1], flag_bias_, flag_relu_); } @@ -360,6 +413,7 @@ class Conv2DTransposeComputeTester : public arena::TestCase { op_desc->SetInput("Bias", {bias_}); } op_desc->SetAttr("fuse_relu", flag_relu_); + op_desc->SetAttr("padding_algorithm", padding_algorithm_); } void PrepareData() override { @@ -402,49 +456,66 @@ TEST(conv2d_transpose, precision) { LOG(INFO) << "test conv2d_transpose op"; #ifdef LITE_WITH_ARM Place place(TARGET(kARM)); - for (auto n : {1, 2}) { + for (auto n : {2}) { for (auto ic : {1, 4 /*, 128*/}) { 
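// A hedged, standalone sketch of the two formulas exercised above: the "SAME"
// rule from UpdatePaddingAndDilation, and the transposed-conv output extent
// with asymmetric padding used by RunBaseline.

#include <algorithm>
#include <cstdio>

// "SAME": pad so that out = ceil(in / stride); an odd total splits as
// (pad0, pad1) = (sum / 2, sum - sum / 2).
void SamePadding(int in, int stride, int ksize, int* pad0, int* pad1) {
  int out = (in + stride - 1) / stride;
  int pad_sum = std::max((out - 1) * stride + ksize - in, 0);
  *pad0 = pad_sum / 2;
  *pad1 = pad_sum - *pad0;
}

// oh = (ih - 1) * stride - pad0 - pad1 + dilation * (ks - 1) + 1
int DeconvOutSize(int in, int stride, int pad0, int pad1, int dilation, int ks) {
  return (in - 1) * stride - pad0 - pad1 + dilation * (ks - 1) + 1;
}

int main() {
  int p0 = 0, p1 = 0;
  SamePadding(/*in=*/8, /*stride=*/2, /*ksize=*/3, &p0, &p1);
  std::printf("SAME pads: %d, %d\n", p0, p1);                          // 0, 1
  std::printf("deconv out: %d\n", DeconvOutSize(8, 2, p0, p1, 1, 3));  // 16
  return 0;
}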
for (auto oc : {1, 4 /*, 128*/}) { LOG(INFO) << "n:" << n << ",ic:" << ic << ",oc:" << oc; - for (auto ih : {8, 16 /*, 56 , 112, 224, 512*/}) { + for (auto ih : {8, 8 /*, 56 , 112, 224, 512*/}) { for (auto iw : {8, 16 /*, 56, 112, 224, 512*/}) { for (auto flag_bias : {false, true}) { for (auto flag_relu : {false, true}) { for (auto dilation : {1, 2}) { for (auto stride : {1, 2}) { - for (auto padding : {0, 2}) { - for (auto ks : {2, 5}) { - for (auto group : {1, 2}) { - // obtain shape - // LOG(INFO) << "n:" << n << ",ic:" << ic << ",oc:" << - // oc - // << ",ih:" << ih << ",iw:" << iw - // << ",flag_bias:" << flag_bias - // << ",flag_relu:" << flag_relu - // << ",dila:" << dilation - // << ",stride:" << stride - // << ",padding:" << padding << ",ks:" << ks - // << ",group:" << group; - if (ic % group != 0 || oc % group != 0) { - group = 1; + for (auto pad_h0 : {0, 1}) { + for (auto pad_h1 : {0, 1}) { + for (auto pad_w0 : {0, 1}) { + for (auto pad_w1 : {0, 1}) { + for (auto ks : {1, 4}) { + for (auto group : {1, 2}) { + for (auto padding_algorithm : + {"", "SAME", "VALID"}) { + // obtain shape + // LOG(INFO) << "n:" << n << ",ic:" << ic << + // ",oc:" << + // oc + // << ",ih:" << ih << ",iw:" << iw + // << ",flag_bias:" << flag_bias + // << ",flag_relu:" << flag_relu + // << ",dila:" << dilation + // << ",stride:" << stride + // << ",padding:" << padding << + // ",ks:" << ks + // << ",group:" << group; + if (ic % group != 0 || oc % group != 0) { + group = 1; + } + std::unique_ptr tester( + new Conv2DTransposeComputeTester( + place, + "def", + n, + ic, + oc, + ih, + iw, + flag_bias, + flag_relu, + dilation, + stride, + pad_h0, + pad_h1, + pad_w0, + pad_w1, + ks, + group, + padding_algorithm)); + arena::Arena arena( + std::move(tester), place, 2e-5); + arena.TestPrecision(); + } + } + } } - std::unique_ptr tester( - new Conv2DTransposeComputeTester(place, - "def", - n, - ic, - oc, - ih, - iw, - flag_bias, - flag_relu, - dilation, - stride, - padding, - ks, - group)); - arena::Arena arena(std::move(tester), place, 2e-5); - arena.TestPrecision(); } } } diff --git a/lite/tests/kernels/fill_constant_compute_test.cc b/lite/tests/kernels/fill_constant_compute_test.cc new file mode 100644 index 0000000000..e211582b04 --- /dev/null +++ b/lite/tests/kernels/fill_constant_compute_test.cc @@ -0,0 +1,178 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" + +namespace paddle { +namespace lite { + +class FillConstantComputeTester : public arena::TestCase { + protected: + // common attributes for this op. 
+ std::string out_ = "out"; + int dtype_{static_cast(VarDescAPI::VarDataType::FP32)}; + std::vector shape_{}; + std::string shape_tensor_ = "ShapeTensor"; + std::vector shape_tensor_list_; + bool is_use_shape_tensor_{false}; + bool is_use_shape_tensor_list_{false}; + + float value_{0.0f}; + // useless for x86, keep it for compatibility + bool force_cpu_{false}; + // DDim shape_tensor_data{{5, 3}}; + std::vector shape_tensor_data; + DDim shape_test{{1, 2}}; + + public: + FillConstantComputeTester(const Place& place, + const std::string& alias, + std::vector shape, + const bool is_use_shape_tensor, + const bool is_use_shape_tensor_list, + float value, + bool force_cpu) + : TestCase(place, alias) { + shape_ = shape; + value_ = value; + force_cpu_ = force_cpu; + is_use_shape_tensor_ = is_use_shape_tensor; + is_use_shape_tensor_list_ = is_use_shape_tensor_list; + + for (int i = 0; i < shape_test.size(); i++) { + shape_tensor_data.push_back(i + 1); + } + } + + void RunBaseline(Scope* scope) override { + auto* out = scope->NewTensor(out_); + DDim output_dims{shape_}; + if (is_use_shape_tensor_) { + auto* temp_shape = scope->FindTensor(shape_tensor_); + auto* shape_data = temp_shape->data(); + auto vec_shape = + std::vector(shape_data, shape_data + temp_shape->numel()); + output_dims.ConstructFrom(vec_shape); + } + if (is_use_shape_tensor_list_) { + std::vector vec_shape; + for (int i = 0; i < shape_tensor_list_.size(); i++) { + auto* temp_shape = scope->FindTensor(shape_tensor_list_[i]); + vec_shape.push_back(*temp_shape->data()); + } + + output_dims.ConstructFrom(vec_shape); + } + out->Resize(output_dims); + + auto* output_data = out->mutable_data(); + for (int i = 0; i < out->numel(); i++) { + output_data[i] = value_; + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + LOG(INFO) << "PrepareOpDesc"; + + op_desc->SetType("fill_constant"); + op_desc->SetAttr("dtype", dtype_); + op_desc->SetAttr("shape", shape_); + op_desc->SetAttr("value", value_); + op_desc->SetAttr("force_cpu", force_cpu_); + if (is_use_shape_tensor_) { + op_desc->SetInput("ShapeTensor", {shape_tensor_}); + } + if (is_use_shape_tensor_list_) { + // std::vector shape_tensor_list_; + for (int i = 0; i < shape_test.size(); ++i) { + shape_tensor_list_.push_back("shape_tensor_list_" + std::to_string(i)); + } + op_desc->SetInput("ShapeTensorList", {shape_tensor_list_}); + } + op_desc->SetOutput("Out", {out_}); + } + + void PrepareData() override { + if (is_use_shape_tensor_) { + // std::vector temp = x_dims_.data(); + // int64_t* data = temp.data(); + SetCommonTensor(shape_tensor_, shape_test, shape_tensor_data.data()); + } + if (is_use_shape_tensor_list_) { + Scope& scope_ = this->scope(); + for (int i = 0; i < shape_test.size(); ++i) { + auto* tensor = + scope_.NewTensor("shape_tensor_list_" + std::to_string(i)); + tensor->Resize(DDim({1})); + auto* d = tensor->mutable_data(); + d[0] = shape_tensor_data[i]; + } + } + } +}; + +TEST(fill_constant, precision) { + LOG(INFO) << "test fill_constant op, kARM"; +#ifdef LITE_WITH_ARM + Place place(TARGET(kARM)); + std::vector shape{1, 2}; + + for (int dtype : {static_cast(VarDescAPI::VarDataType::INT32)}) { + for (float value : {1, 2}) { + for (bool is_use_shape_tensor_list : {false, true}) { + for (bool is_use_shape_tensor : {false, true}) { + if (is_use_shape_tensor && is_use_shape_tensor_list) break; + LOG(INFO) << "value:" << value + << ", is_use_shape_tensor:" << is_use_shape_tensor + << ", is_use_shape_tensor_list:" + << is_use_shape_tensor_list; + + std::unique_ptr tester( + new 
FillConstantComputeTester(place, + "def", + shape, + is_use_shape_tensor, + is_use_shape_tensor_list, + value, + false)); + arena::Arena arena(std::move(tester), place, 2e-5); + arena.TestPrecision(); + } + } + } + } +#endif + +#ifdef LITE_WITH_X86 + Place place(TARGET(kX86)); + LOG(INFO) << "test concate op, x86"; + for (int axis : {1, 2}) { + for (bool is_use_axis_tensor : {false, true}) { + LOG(INFO) << "axis:" << axis + << ", is_use_axis_tensor:" << is_use_axis_tensor; + std::unique_ptr tester( + new ConcateComputeTester(place, "def", axis, is_use_axis_tensor)); + arena::Arena arena(std::move(tester), place, 2e-5); + arena.TestPrecision(); + } + } + +#endif +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/lrn_compute_test.cc b/lite/tests/kernels/lrn_compute_test.cc index 9ee43c5c60..e306155514 100644 --- a/lite/tests/kernels/lrn_compute_test.cc +++ b/lite/tests/kernels/lrn_compute_test.cc @@ -158,7 +158,7 @@ class LrnComputeTester : public arena::TestCase { op_desc->SetOutput("Out", {output_}); op_desc->SetAttr("alpha", alpha_); op_desc->SetAttr("beta", beta_); - op_desc->SetAttr("local_size", local_size_); + op_desc->SetAttr("n", local_size_); op_desc->SetAttr("k", k_); op_desc->SetAttr("norm_region", norm_region_); } diff --git a/lite/tests/kernels/nearest_interp_compute_test.cc b/lite/tests/kernels/nearest_interp_compute_test.cc index 3256ababca..894959f909 100644 --- a/lite/tests/kernels/nearest_interp_compute_test.cc +++ b/lite/tests/kernels/nearest_interp_compute_test.cc @@ -22,6 +22,28 @@ namespace paddle { namespace lite { +inline std::vector get_new_shape( + const std::vector& list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + vec_new_shape.push_back(static_cast(*tensor->data())); + } + + return vec_new_shape; +} + +template +inline std::vector get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + lite::Tensor cpu_starts_tensor; + vec_new_data = + std::vector(new_data, new_data + new_data_tensor->dims().production()); + return vec_new_data; +} + template void resize_nearest_align(std::vector inputs, lite::Tensor* output, @@ -73,6 +95,9 @@ class NearestInterpComputeTester : public arena::TestCase { protected: // common attributes for this op. 
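// The fill_constant tester above takes the output shape from one of three
// sources (exercised one at a time in the test); a sketch of that choice,
// with plain vectors standing in for ShapeTensor / ShapeTensorList -- the
// precedence shown here is an assumption for illustration:

#include <cstdint>
#include <vector>

std::vector<int64_t> FillConstantShape(
    const std::vector<int64_t>& shape_attr,           // "shape" attribute
    const std::vector<int64_t>& shape_tensor,         // ShapeTensor contents
    const std::vector<int64_t>& shape_tensor_list) {  // one value per entry
  if (!shape_tensor.empty()) return shape_tensor;
  if (!shape_tensor_list.empty()) return shape_tensor_list;
  return shape_attr;
}
// e.g. FillConstantShape({1, 2}, {}, {3, 4}) -> {3, 4}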
std::string input0_ = "X"; + std::string sizetensor0_ = "SizeTensor0"; + std::string sizetensor1_ = "SizeTensor1"; + std::string input_scale_ = "Scale"; std::string input1_ = "OutSize"; std::string output_ = "Out"; @@ -85,6 +110,8 @@ class NearestInterpComputeTester : public arena::TestCase { DDim dims_{{2, 3}}; DDim _dims0_{{2, 3, 3, 2}}; DDim _dims1_{{2}}; + DDim sizetensor_dims_{{1}}; + DDim scale_dims_{{1}}; public: NearestInterpComputeTester(const Place& place, @@ -112,24 +139,54 @@ class NearestInterpComputeTester : public arena::TestCase { inputs.emplace_back(scope->FindTensor(input0_)); inputs.emplace_back(scope->FindTensor(input1_)); - auto outsize_data = inputs[1]->data(); + std::vector SizeTensor(2); + SizeTensor[0] = scope->FindTensor(sizetensor0_); + SizeTensor[1] = scope->FindTensor(sizetensor1_); + const lite::Tensor* input_scale = scope->FindTensor(input_scale_); + + float scale = height_scale_; + int in_h = inputs[0]->dims()[2]; + int in_w = inputs[0]->dims()[3]; + if (SizeTensor.size() > 0) { + auto new_size = get_new_shape(SizeTensor); + out_height_ = new_size[0]; + out_width_ = new_size[1]; + } else { + auto scale_tensor = input_scale; + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } + if (scale > 0) { + out_height_ = static_cast(in_h * scale); + out_width_ = static_cast(in_w * scale); + } + auto out_size = inputs[1]; + if (out_size != nullptr) { + auto out_size_data = get_new_data_from_tensor(out_size); + out_height_ = out_size_data[0]; + out_width_ = out_size_data[1]; + } + } + height_scale_ = scale; + width_scale_ = scale; + if (out_width_ != -1 && out_height_ != -1) { height_scale_ = static_cast(out_height_ / inputs[0]->dims()[2]); width_scale_ = static_cast(out_width_ / inputs[0]->dims()[3]); } - if (inputs.size() > 1) { - int h_out = outsize_data[0]; // HW - int w_out = outsize_data[1]; // HW - int num_cout = outputs->dims()[0]; - int c_cout = outputs->dims()[1]; - outputs->Resize({num_cout, c_cout, h_out, w_out}); - } + int num_cout = inputs[0]->dims()[0]; + int c_cout = inputs[0]->dims()[1]; + outputs->Resize({num_cout, c_cout, out_height_, out_width_}); + resize_nearest_align(inputs, outputs, align_corners_); } void PrepareOpDesc(cpp::OpDesc* op_desc) { op_desc->SetType("nearest_interp"); op_desc->SetInput("X", {input0_}); + op_desc->SetInput("SizeTensor", {sizetensor0_, sizetensor1_}); + op_desc->SetInput("Scale", {input_scale_}); op_desc->SetInput("OutSize", {input1_}); op_desc->SetOutput("Out", {output_}); op_desc->SetAttr("scale", height_scale_); @@ -152,6 +209,17 @@ class NearestInterpComputeTester : public arena::TestCase { SetCommonTensor(input0_, _dims0_, data0.data()); SetCommonTensor(input1_, _dims1_, data1.data()); + + std::vector sizetensor_data(1); + sizetensor_data[0] = out_height_; + SetCommonTensor(sizetensor0_, sizetensor_dims_, sizetensor_data.data()); + + sizetensor_data[0] = out_width_; + SetCommonTensor(sizetensor1_, sizetensor_dims_, sizetensor_data.data()); + + std::vector scale_data(1); + scale_data[0] = height_scale_; + SetCommonTensor(input_scale_, scale_dims_, scale_data.data()); } }; diff --git a/lite/tests/kernels/search_aligned_mat_mul_compute_test.cc b/lite/tests/kernels/search_aligned_mat_mul_compute_test.cc new file mode 100644 index 0000000000..cb824931ae --- /dev/null +++ b/lite/tests/kernels/search_aligned_mat_mul_compute_test.cc @@ -0,0 +1,220 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" +#include "lite/tests/utils/naive_math_impl.h" + +namespace paddle { +namespace lite { + +class SearchAlignedMatMulComputeTester : public arena::TestCase { + protected: + // common attributes for this op. + std::string x_ = "X"; + std::string y_ = "Y"; + bool x_transpose_; + bool y_transpose_; + float alpha_; + std::string out_ = "Out"; + DDim x_dims_; + DDim y_dims_; + LoD x_lod_; + LoD y_lod_; + + public: + SearchAlignedMatMulComputeTester(const Place& place, + const std::string& alias, + bool x_transpose, + bool y_transpose, + float alpha, + const DDim& x_dims, + const DDim& y_dims, + const LoD& x_lod, + const LoD& y_lod) + : TestCase(place, alias), + x_transpose_(x_transpose), + y_transpose_(y_transpose), + alpha_(alpha), + x_dims_(x_dims), + y_dims_(y_dims), + x_lod_(x_lod), + y_lod_(y_lod) {} + + void RunBaseline(Scope* scope) override { + auto x = scope->FindTensor(x_); + auto y = scope->FindTensor(y_); + CHECK(x); + CHECK(y); + const auto x_data = x->data(); + const auto y_data = y->data(); + auto out = scope->NewTensor(out_); + CHECK(out); + + const auto x_dims = x->dims(); + const auto y_dims = y->dims(); + const auto& x_lod = x->lod(); + const auto& y_lod = y->lod(); + const auto& x_lod_0 = x_lod[0]; + const auto& y_lod_0 = y_lod[0]; + + int seq_num = x_lod_0.size() - 1; + int x_inner_size = x_dims[1]; + int y_inner_size = y_dims[1]; + int x_batch_size = x_lod_0[1]; + int y_batch_size = y_lod_0[1]; + int M = x_transpose_ ? x_inner_size : x_batch_size; + int N = y_transpose_ ? y_batch_size : y_inner_size; + int X_K = x_transpose_ ? x_batch_size : x_inner_size; + int Y_K = y_transpose_ ? y_inner_size : y_batch_size; + CHECK_EQ(X_K, Y_K) << "K of Input(X) and Input(Y) is not equal"; + int K = X_K; + int x_stride = x_batch_size * x_inner_size; + int y_stride = y_batch_size * y_inner_size; + int out_stride = M * N; + int lda = x_transpose_ ? M : K; + int ldb = y_transpose_ ? 
K : N; + int ldc = N; + + LoD out_lod; + std::vector out_lod_0(seq_num + 1); + out_lod_0[0] = 0; + for (int i = 0; i < seq_num; i++) { + out_lod_0[i + 1] = out_lod_0[i] + M; + } + out_lod.push_back(out_lod_0); + DDim out_dims( + {static_cast(out_lod_0.back()), static_cast(N)}); + out->set_lod(out_lod); + out->Resize(out_dims); + + auto out_data = out->mutable_data(); + for (int i = 0; i < seq_num; i++) { + basic_gemm(x_transpose_, + y_transpose_, + M, + N, + K, + alpha_, + x_data + i * x_stride, + lda, + y_data + i * y_stride, + ldb, + 0, + out_data + i * out_stride, + ldc, + nullptr, + false, + false); + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType("search_aligned_mat_mul"); + op_desc->SetInput("X", {x_}); + op_desc->SetInput("Y", {y_}); + op_desc->SetOutput("Out", {out_}); + op_desc->SetAttr("transpose_X", x_transpose_); + op_desc->SetAttr("transpose_Y", y_transpose_); + op_desc->SetAttr("alpha", alpha_); + } + + void PrepareData() override { + std::vector x_data(x_dims_.production()); + std::vector y_data(y_dims_.production()); + fill_data_rand(x_data.data(), -1.f, 1.f, x_dims_.production()); + fill_data_rand(y_data.data(), -1.f, 1.f, y_dims_.production()); + SetCommonTensor(x_, x_dims_, x_data.data(), x_lod_); + SetCommonTensor(y_, y_dims_, y_data.data(), y_lod_); + } +}; + +void test_search_aligned_mat_mul(Place place) { + for (int seq_num : {1, 2}) { + for (int x_batch_size : {1, 3}) { + for (int x_inner_size : {1, 5}) { + for (int out_inner_size : {1, 4}) { + for (bool x_transpose : {true, false}) { + for (bool y_transpose : {true, false}) { + for (float alpha : {1., 2.}) { + // infer x_dims and y_dims + int y_batch_size; + int y_inner_size; + if (x_transpose) { + if (y_transpose) { + y_batch_size = out_inner_size; + y_inner_size = x_batch_size; + } else { + y_batch_size = x_batch_size; + y_inner_size = out_inner_size; + } + } else { + if (y_transpose) { + y_batch_size = out_inner_size; + y_inner_size = x_inner_size; + } else { + y_batch_size = x_inner_size; + y_inner_size = out_inner_size; + } + } + std::vector x_lod_0(seq_num + 1); + std::vector y_lod_0(seq_num + 1); + x_lod_0[0] = 0; + y_lod_0[0] = 0; + for (int i = 0; i < seq_num; i++) { + x_lod_0[i + 1] = x_lod_0[i] + x_batch_size; + y_lod_0[i + 1] = y_lod_0[i] + y_batch_size; + } + LoD x_lod; + LoD y_lod; + x_lod.push_back(x_lod_0); + y_lod.push_back(y_lod_0); + DDim x_dims({static_cast(x_lod_0.back()), + static_cast(x_inner_size)}); + DDim y_dims({static_cast(y_lod_0.back()), + static_cast(y_inner_size)}); + + std::unique_ptr tester( + new SearchAlignedMatMulComputeTester(place, + "def", + x_transpose, + y_transpose, + alpha, + x_dims, + y_dims, + x_lod, + y_lod)); + arena::Arena arena(std::move(tester), place, 5e-4); + arena.TestPrecision(); + } + } + } + } + } + } + } +} + +TEST(SearchAlignedMatMul, precision) { +#ifdef LITE_WITH_X86 + Place place(TARGET(kX86)); + test_search_aligned_mat_mul(place); +#endif +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/search_seq_fc_compute_test.cc b/lite/tests/kernels/search_seq_fc_compute_test.cc new file mode 100644 index 0000000000..988d3a27cc --- /dev/null +++ b/lite/tests/kernels/search_seq_fc_compute_test.cc @@ -0,0 +1,177 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" +#include "lite/tests/utils/naive_math_impl.h" + +namespace paddle { +namespace lite { + +class SearchSeqFcOPTest : public arena::TestCase { + protected: + // common attributes for this op. + std::string x_ = "x"; + std::string w_ = "w"; + std::string b_ = "b"; + std::string out_ = "out"; + DDim x_dims_; + DDim w_dims_; + DDim b_dims_; + LoD x_lod_; + bool has_bias_; + int out_size_; + + public: + SearchSeqFcOPTest(const Place& place, + const std::string& alias, + DDim x_dims, + DDim w_dims, + DDim b_dims, + LoD x_lod, + bool has_bias, + int out_size) + : TestCase(place, alias), + x_dims_(x_dims), + w_dims_(w_dims), + b_dims_(b_dims), + x_lod_(x_lod), + has_bias_(has_bias), + out_size_(out_size) {} + + void RunBaseline(Scope* scope) override { + auto x = scope->FindTensor(x_); + auto w = scope->FindTensor(w_); + CHECK(x); + CHECK(w); + auto out = scope->NewTensor(out_); + CHECK(out); + + const auto x_data = x->data(); + const auto w_data = w->data(); + const auto x_dims = x->dims(); + const auto w_dims = w->dims(); + const auto& x_lod = x->lod(); + CHECK_EQ(x_dims.size(), 2) << "The Input(X) should be 2-D tensor."; + CHECK(!x_lod.empty()) << "The Input(X) must hold lod info."; + const auto& x_lod_0 = x_lod[0]; + CHECK_GE(x_lod_0.size(), 2) << "The Input(X)'s lod info is corrupted."; + CHECK_EQ(x_dims[0], static_cast(x_lod_0.back())) + << "The Input(X)'s lod info mismatches the actual tensor shape."; + CHECK_EQ(w_dims.size(), 2) << "W should be 2-D tensor."; + CHECK_EQ(x_dims[1], w_dims[1]) << "Wrong shape: x_dims[1] != w_dims[1]"; + CHECK_EQ(w_dims[0], out_size_) << "Wrong shape: w_dims[0] != out_size"; + + const float* b_data = nullptr; + if (has_bias_) { + auto b = scope->FindTensor(b_); + CHECK(b); + auto b_dims = b->dims(); + CHECK_EQ(b_dims.size(), 1) << "b should be 1-D tensor."; + CHECK_EQ(b_dims[0], w_dims[0]) << "Wrong shape: b_dims[0] != w_dims[0]"; + b_data = b->data(); + } + + out->set_lod(x_lod); + out->Resize({x_dims[0], w_dims[0]}); + + int M = x_dims[0]; + int K = x_dims[1]; + int N = w_dims[0]; + auto out_data = out->mutable_data(); + basic_gemm(false, + true, + M, + N, + K, + 1.f, + x_data, + K, + w_data, + K, + 0, + out_data, + N, + nullptr, + false, + false); + if (b_data != nullptr) { + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + out_data[i * N + j] += b_data[j]; + } + } + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType("search_seq_fc"); + op_desc->SetInput("X", {x_}); + op_desc->SetInput("W", {w_}); + if (has_bias_) { + op_desc->SetInput("b", {b_}); + } + op_desc->SetAttr("has_bias", has_bias_); + op_desc->SetAttr("out_size", out_size_); + op_desc->SetOutput("Out", {out_}); + } + + void PrepareData() override { + std::vector x_data(x_dims_.production()); + std::vector w_data(w_dims_.production()); + fill_data_rand(x_data.data(), -1.f, 1.f, x_dims_.production()); + fill_data_rand(w_data.data(), -1.f, 1.f, w_dims_.production()); + 
SetCommonTensor(x_, x_dims_, x_data.data(), x_lod_); + SetCommonTensor(w_, w_dims_, w_data.data()); + if (has_bias_) { + std::vector b_data(b_dims_.production()); + fill_data_rand(b_data.data(), -1.f, 1.f, b_dims_.production()); + SetCommonTensor(b_, b_dims_, b_data.data()); + } + } +}; + +void test_search_seq_fc(Place place) { + for (auto x_lod_0 : {std::vector({0, 1, 3}), + std::vector({0, 3, 4, 5})}) { + for (auto feature_size : {2, 9}) { + for (auto out_size : {3, 5}) { + for (auto has_bias : {true, false}) { + DDim x_dims({static_cast(x_lod_0.back()), feature_size}); + DDim w_dims({out_size, feature_size}); + DDim b_dims({has_bias ? out_size : 0}); + LoD x_lod; + x_lod.push_back(x_lod_0); + std::unique_ptr tester(new SearchSeqFcOPTest( + place, "def", x_dims, w_dims, b_dims, x_lod, has_bias, out_size)); + arena::Arena arena(std::move(tester), place, 6e-5); + arena.TestPrecision(); + } + } + } + } +} + +TEST(SearchSeqFcOP, precision) { +#ifdef LITE_WITH_X86 + Place place(TARGET(kX86)); + test_search_seq_fc(place); +#endif +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/shuffle_channel_compute_test.cc b/lite/tests/kernels/shuffle_channel_compute_test.cc index d0e9912e65..66123625fa 100644 --- a/lite/tests/kernels/shuffle_channel_compute_test.cc +++ b/lite/tests/kernels/shuffle_channel_compute_test.cc @@ -12,12 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -// TODO(zhengxi) -// shuffle_channel_test can pass on local compilation -// while on ci compilation, the test will be killed immediately. - -/* -#include +// TODO(FrostML): shaffle_channel cannot pass on CI, but ok in local machine. +// Open this. +/*#include #include "lite/api/paddle_use_kernels.h" #include "lite/api/paddle_use_ops.h" #include "lite/core/arena/framework.h" @@ -30,8 +27,8 @@ class ShuffleChannelComputeTester : public arena::TestCase { // common attributes for this op. 
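// A plain reference loop for the search_seq_fc baseline above:
// Out = X * W^T (+ b broadcast over rows), with X: [M, K], W: [N, K],
// Out: [M, N]; this mirrors the basic_gemm call with transb = true.

void SearchSeqFcRef(const float* x, const float* w, const float* b,
                    int M, int N, int K, float* out) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = (b != nullptr) ? b[n] : 0.f;  // bias only when has_bias
      for (int k = 0; k < K; ++k) acc += x[m * K + k] * w[n * K + k];
      out[m * N + n] = acc;
    }
  }
}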
std::string input_ = "X"; std::string output_ = "Out"; - int group_ = 1; - DDim dims_{{1, 2}}; + int group_ = 4; + DDim dims_{{10, 16, 4, 4}}; public: ShuffleChannelComputeTester(const Place& place, @@ -87,7 +84,7 @@ class ShuffleChannelComputeTester : public arena::TestCase { }; void test_shuffle_channel(Place place) { - for (int group : {1, 2, 3}) { + for (int group : {4}) { std::unique_ptr tester( new ShuffleChannelComputeTester(place, "def", group)); arena::Arena arena(std::move(tester), place, 2e-5); diff --git a/lite/tests/kernels/unsqueeze_compute_test.cc b/lite/tests/kernels/unsqueeze_compute_test.cc index 9bbf39b70d..22e475672a 100644 --- a/lite/tests/kernels/unsqueeze_compute_test.cc +++ b/lite/tests/kernels/unsqueeze_compute_test.cc @@ -125,8 +125,7 @@ class UnsqueezeComputeTester : public arena::TestCase { for (size_t i = 0; i < axes_.size(); i++) { name = name + std::to_string(i); axes_tensor_list_.push_back(name); - std::vector in_data = {axes_[i]}; - SetCommonTensor(name, DDim({1}), in_data.data()); + SetCommonTensor(name, DDim({1}), &axes_[i]); } } } @@ -230,7 +229,7 @@ void test_unsqueeze(Place place) { for (int C : {3}) { for (int H : {1}) { for (int W : {5}) { - for (int input_axes_flag : {1, 2}) { + for (int input_axes_flag : {1, 2, 3}) { LOG(INFO) << N << " " << C << " " << H << " " << W << " " << input_axes_flag; std::unique_ptr tester( diff --git a/lite/tests/math/CMakeLists.txt b/lite/tests/math/CMakeLists.txt index 87324375e0..7dd4f522db 100644 --- a/lite/tests/math/CMakeLists.txt +++ b/lite/tests/math/CMakeLists.txt @@ -1,9 +1,17 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) lite_cc_test(sgemm_compute_test SRCS sgemm_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(sgemv_compute_test SRCS sgemv_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(sgemm_c4_compute_test SRCS sgemm_c4_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(gemm_int8_compute_test SRCS gemm_int8_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(gemv_int8_compute_test SRCS gemv_int8_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(conv_compute_test SRCS conv_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(conv_transpose_compute_test SRCS conv_transpose_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(conv_int8_compute_test SRCS conv_int8_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(pool_compute_test SRCS pool_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) + + if(LITE_BUILD_EXTRA) + lite_cc_test(layout_compute_test SRCS layout_compute_test.cc DEPS arena_framework ${arm_kernels} ${lite_ops} ${host_kernels}) + endif() + + endif() diff --git a/lite/tests/math/conv_compute_test.cc b/lite/tests/math/conv_compute_test.cc index bfb74e6e0a..bda50d3563 100644 --- a/lite/tests/math/conv_compute_test.cc +++ b/lite/tests/math/conv_compute_test.cc @@ -15,10 +15,10 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/operators/op_params.h" #include "lite/tests/utils/naive_math_impl.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" #ifdef LITE_WITH_ARM #include 
"lite/kernels/arm/conv_compute.h" @@ -59,26 +59,30 @@ DEFINE_bool(flag_bias, true, "with bias"); typedef paddle::lite::DDim DDim; typedef paddle::lite::Tensor Tensor; typedef paddle::lite::operators::ConvParam ConvParam; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DDim compute_out_dim(const DDim& dim_in, const paddle::lite::operators::ConvParam& param) { DDim dim_out = dim_in; + auto paddings = *param.paddings; + auto dilations = *param.dilations; dim_out[1] = param.filter->dims()[0]; auto kernel_h = param.filter->dims()[2]; auto kernel_w = param.filter->dims()[3]; auto h = dim_in[2]; auto w = dim_in[3]; - int dila_h = param.dilations[0]; - int dila_w = param.dilations[1]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int dila_h = dilations[0]; + int dila_w = dilations[1]; + int pad_top = paddings[0]; + int pad_bottom = paddings[1]; + int pad_left = paddings[2]; + int pad_right = paddings[3]; int stride_h = param.strides[0]; int stride_w = param.strides[1]; auto kernel_exten = dila_h * (kernel_h - 1) + 1; - auto hout = (h + 2 * pad_h - kernel_exten) / stride_h + 1; + auto hout = (h + pad_top + pad_bottom - kernel_exten) / stride_h + 1; kernel_exten = dila_w * (kernel_w - 1) + 1; - auto wout = (w + 2 * pad_w - kernel_exten) / stride_w + 1; + auto wout = (w + pad_left + pad_right - kernel_exten) / stride_w + 1; dim_out[2] = hout; dim_out[3] = wout; return dim_out; @@ -110,8 +114,8 @@ void test_conv_fp32(const std::vector& input_dims, param.bias->set_precision(PRECISION(kFloat)); } param.strides = strides; - param.paddings = pads; - param.dilations = dilas; + param.paddings = std::make_shared>(pads); + param.dilations = std::make_shared>(dilas); param.fuse_relu = flag_relu; param.groups = group; @@ -162,7 +166,7 @@ void test_conv_fp32(const std::vector& input_dims, param.output->Resize(dim_out); paddle::lite::fill_tensor_rand(*param.x, -1.f, 1.f); - // paddle::lite::fill_tensor_const(*param.x, 1.f); + // paddle::lite::fill_tensor_const(*param.x, 1.f); auto din = param.x->data(); Tensor tout_basic; @@ -189,7 +193,7 @@ void test_conv_fp32(const std::vector& input_dims, strides[0], dilas[1], dilas[0], - pads[1], + pads[2], pads[0], flag_bias, flag_relu); @@ -201,19 +205,19 @@ void test_conv_fp32(const std::vector& input_dims, /// compute Timer t0; for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); conv.Launch(); - t0.end(); + t0.Stop(); } double gops = 2.0 * dim_out.production() * dim_in[1] * weight_dim[2] * weight_dim[3] / param.groups; LOG(INFO) << "conv fp32: input shape: " << dim_in << ", output shape" - << dim_out << ",running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); if (FLAGS_check_result) { double max_ratio = 0; @@ -235,7 +239,8 @@ void test_conv_fp32(const std::vector& input_dims, LOG(FATAL) << "test fp32 conv: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " + << pads[2] << ", " << pads[3] << ", stride: " << strides[0] << ", " << strides[1] << ", dila_: " << dilas[0] << ", " << 
dilas[1] << ", bias: " << (flag_bias ? "true" : "false") @@ -280,27 +285,33 @@ void test_conv_fp32(const std::vector& input_dims, TEST(TestConv3x3DW, test_conv3x3_depthwise) { if (FLAGS_basic_test) { for (auto& stride : {1, 2}) { - for (auto& pad : {0, 1}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - for (auto& c : {1, 3, 5, 8, 16, 32}) { - std::vector dims; - DDim weights_dim({c, 1, 3, 3}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 3, 15, 19, 28, 32, 75}) { - dims.push_back(DDim({batch, c, h, h})); + for (auto& pad_left : {0, 1, 2}) { + for (auto& pad_right : {0, 1, 2}) { + for (auto& pad_top : {0, 1, 2}) { + for (auto& pad_bottom : {0, 1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + for (auto& c : {1, 3, 5, 8, 16, 32}) { + std::vector dims; + DDim weights_dim({c, 1, 3, 3}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 3, 15, 19, 28, 32, 75}) { + dims.push_back(DDim({batch, c, h, h})); + } + } + test_conv_fp32(dims, + weights_dim, + c, + {stride, stride}, + {pad_top, pad_bottom, pad_left, pad_right}, + {1, 1}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_fp32(dims, - weights_dim, - c, - {stride, stride}, - {pad, pad}, - {1, 1}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -329,7 +340,7 @@ TEST(TestConv5x5DW, test_conv5x5_depthwise) { weights_dim, c, {stride, stride}, - {pad, pad}, + {pad, pad, pad, pad}, {1, 1}, flag_bias, flag_relu, @@ -366,7 +377,7 @@ TEST(TestConv1x1s1, test_conv1x1s1) { weights_dim, g, {1, 1}, - {0, 0}, + {0, 0, 0, 0}, {1, 1}, flag_bias, flag_relu, @@ -386,26 +397,32 @@ TEST(TestConv3x3s1, test_conv_3x3s1) { if (FLAGS_basic_test) { for (auto& cin : {1, 3, 8, 32, 48}) { for (auto& cout : {1, 5, 8, 32, 48}) { - for (auto& pad : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - std::vector dims; - DDim weights_dim({cout, cin, 3, 3}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 7, 19, 56, 32}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_left : {1, 2}) { + for (auto& pad_right : {1, 2}) { + for (auto& pad_top : {1, 2}) { + for (auto& pad_bottom : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + std::vector dims; + DDim weights_dim({cout, cin, 3, 3}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 7, 19, 56, 32}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_fp32(dims, + weights_dim, + 1, + {1, 1}, + {pad_top, pad_bottom, pad_left, pad_right}, + {1, 1}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_fp32(dims, - weights_dim, - 1, - {1, 1}, - {pad, pad}, - {1, 1}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -420,26 +437,32 @@ TEST(TestConv3x3s2, test_conv_3x3s2) { if (FLAGS_basic_test) { for (auto& cin : {1, 3, 8, 32}) { for (auto& cout : {1, 5, 8, 32}) { - for (auto& pad : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - std::vector dims; - DDim weights_dim({cout, cin, 3, 3}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 7, 19, 28, 75, 56, 32}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_left : {1, 2}) { + for (auto& pad_right : {1, 2}) { + for (auto& pad_top : {1, 2}) { + for (auto& pad_bottom : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + std::vector dims; + DDim 
weights_dim({cout, cin, 3, 3}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 7, 19, 28, 75, 56, 32}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_fp32(dims, + weights_dim, + 1, + {2, 2}, + {pad_top, pad_bottom, pad_left, pad_right}, + {1, 1}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_fp32(dims, - weights_dim, - 1, - {2, 2}, - {pad, pad}, - {1, 1}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -458,30 +481,37 @@ TEST(TestConvRand, test_conv_rand) { for (auto& kw : {1, 2, 3}) { for (auto& kh : {1, 2, 3}) { for (auto& stride : {1, 2}) { - for (auto& pad : {0, 1, 2}) { - for (auto& dila : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - if (cin % g != 0 || cout % g != 0) { - continue; - } - std::vector dims; - DDim weights_dim({cout, cin / g, kh, kw}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 3, 19, 32, 28}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_left : {0, 1, 2}) { + for (auto& pad_right : {0, 1, 2}) { + for (auto& pad_top : {0, 1, 2}) { + for (auto& pad_bottom : {0, 1, 2}) { + for (auto& dila : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + if (cin % g != 0 || cout % g != 0) { + continue; + } + std::vector dims; + DDim weights_dim({cout, cin / g, kh, kw}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 3, 19, 32, 28}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_fp32( + dims, + weights_dim, + g, + {stride, stride}, + {pad_top, pad_bottom, pad_left, pad_right}, + {dila, dila}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_fp32(dims, - weights_dim, - g, - {stride, stride}, - {pad, pad}, - {dila, dila}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -510,7 +540,7 @@ TEST(TestConvCustom, test_conv_fp32_custom_size) { FLAGS_kernel_w}), FLAGS_group, {FLAGS_stride_h, FLAGS_stride_w}, - {FLAGS_pad_h, FLAGS_pad_w}, + {FLAGS_pad_h, FLAGS_pad_h, FLAGS_pad_w, FLAGS_pad_w}, {FLAGS_dila_h, FLAGS_dila_w}, FLAGS_flag_bias, FLAGS_flag_relu, diff --git a/lite/tests/math/conv_int8_compute_test.cc b/lite/tests/math/conv_int8_compute_test.cc index e15b7d22bc..27c186d7ce 100644 --- a/lite/tests/math/conv_int8_compute_test.cc +++ b/lite/tests/math/conv_int8_compute_test.cc @@ -15,10 +15,10 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/operators/op_params.h" #include "lite/tests/utils/naive_math_impl.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" #ifdef LITE_WITH_ARM #include "lite/kernels/arm/conv_compute.h" @@ -59,26 +59,26 @@ DEFINE_bool(flag_bias, true, "with bias"); typedef paddle::lite::DDim DDim; typedef paddle::lite::Tensor Tensor; typedef paddle::lite::operators::ConvParam ConvParam; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DDim compute_out_dim(const DDim& dim_in, const paddle::lite::operators::ConvParam& param) { + auto paddings = *param.paddings; + auto dilations = *param.dilations; DDim dim_out = dim_in; dim_out[1] = param.filter->dims()[0]; auto kernel_h = param.filter->dims()[2]; auto kernel_w = param.filter->dims()[3]; auto h = dim_in[2]; auto w = dim_in[3]; - int dila_h = param.dilations[0]; - int dila_w = param.dilations[1]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + int dila_h = dilations[0]; + int dila_w = dilations[1]; int stride_h = 
param.strides[0]; int stride_w = param.strides[1]; auto kernel_exten = dila_h * (kernel_h - 1) + 1; - auto hout = (h + 2 * pad_h - kernel_exten) / stride_h + 1; + auto hout = (h + paddings[0] + paddings[1] - kernel_exten) / stride_h + 1; kernel_exten = dila_w * (kernel_w - 1) + 1; - auto wout = (w + 2 * pad_w - kernel_exten) / stride_w + 1; + auto wout = (w + paddings[2] + paddings[3] - kernel_exten) / stride_w + 1; dim_out[2] = hout; dim_out[3] = wout; return dim_out; @@ -104,8 +104,8 @@ void get_conv_param(const DDim& dim_w, param->bias->set_precision(PRECISION(kFloat)); } param->strides = strides; - param->paddings = pads; - param->dilations = dila; + param->paddings = std::make_shared>(pads); + param->dilations = std::make_shared>(dila); param->fuse_relu = flag_relu; param->groups = g; @@ -288,7 +288,7 @@ void test_conv_int8(const std::vector& input_dims, strides[0], dilas[1], dilas[0], - pads[1], + pads[2], pads[0], flag_bias, flag_relu); @@ -309,30 +309,30 @@ void test_conv_int8(const std::vector& input_dims, /// compute fp32 output Timer t0; for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); conv_int8_fp32.Launch(); - t0.end(); + t0.Stop(); } LOG(INFO) << "int8 conv, fp32 output: output shape" << dim_out - << ",running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); /// compute int8 output - t0.clear(); + t0.Reset(); for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); conv_int8_int8.Launch(); - t0.end(); + t0.Stop(); } LOG(INFO) << "int8 conv, int8 output: output shape" << dim_out - << ",running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); /// compare result fp32 output if (FLAGS_check_result) { @@ -358,7 +358,8 @@ void test_conv_int8(const std::vector& input_dims, LOG(FATAL) << "test int8 conv, fp32 out: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " + << pads[2] << ", " << pads[3] << ", stride: " << strides[0] << ", " << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? "true" : "false") @@ -416,7 +417,8 @@ void test_conv_int8(const std::vector& input_dims, LOG(FATAL) << "test int8 conv, int8 out: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " + << pads[2] << ", " << pads[3] << ", stride: " << strides[0] << ", " << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? 
"true" : "false") @@ -428,9 +430,9 @@ void test_conv_int8(const std::vector& input_dims, } LOG(INFO) << "test int8 conv: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] - << ", stride: " << strides[0] << ", " << strides[1] - << ", dila_: " << dilas[0] << ", " << dilas[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " << pads[2] + << ", " << pads[3] << ", stride: " << strides[0] << ", " + << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? "true" : "false") << ", relu: " << (flag_relu ? "true" : "false") << ", threads: " << th << ", power_mode: " << cls @@ -473,7 +475,7 @@ TEST(TestConv3x3DWInt8, test_conv3x3_depthwise) { weights_dim, c, {stride, stride}, - {pad, pad}, + {pad, pad, pad, pad}, {1, 1}, flag_bias, flag_relu, @@ -507,7 +509,7 @@ TEST(TestConv5x5DWInt8, test_conv5x5_depthwise) { weights_dim, c, {stride, stride}, - {pad, pad}, + {pad, pad, pad, pad}, {1, 1}, flag_bias, flag_relu, @@ -544,7 +546,7 @@ TEST(TestConv1x1s1Int8, test_conv1x1s1) { weights_dim, g, {1, 1}, - {0, 0}, + {0, 0, 0, 0}, {1, 1}, flag_bias, flag_relu, @@ -564,26 +566,32 @@ TEST(TestConv3x3s1Int8, test_conv_3x3s1) { if (FLAGS_basic_test) { for (auto& cin : {1, 3, 8, 32, 48}) { for (auto& cout : {1, 5, 8, 32, 48}) { - for (auto& pad : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - std::vector dims; - DDim weights_dim({cout, cin, 3, 3}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 7, 19, 56, 32}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_top : {1, 2}) { + for (auto& pad_bottom : {1, 2}) { + for (auto& pad_left : {1, 2}) { + for (auto& pad_right : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + std::vector dims; + DDim weights_dim({cout, cin, 3, 3}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 7, 19, 56, 32}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_int8(dims, + weights_dim, + 1, + {1, 1}, + {pad_top, pad_bottom, pad_left, pad_right}, + {1, 1}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_int8(dims, - weights_dim, - 1, - {1, 1}, - {pad, pad}, - {1, 1}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -598,26 +606,32 @@ TEST(TestConv3x3s2Int8, test_conv_3x3s2) { if (FLAGS_basic_test) { for (auto& cin : {1, 3, 8, 32}) { for (auto& cout : {1, 5, 8, 32}) { - for (auto& pad : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - std::vector dims; - DDim weights_dim({cout, cin, 3, 3}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 7, 19, 28, 75, 56, 32}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_top : {1, 2}) { + for (auto& pad_bottom : {1, 2}) { + for (auto& pad_left : {1, 2}) { + for (auto& pad_right : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + std::vector dims; + DDim weights_dim({cout, cin, 3, 3}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 7, 19, 28, 75, 56, 32}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_int8(dims, + weights_dim, + 1, + {2, 2}, + {pad_top, pad_bottom, pad_left, pad_right}, + {1, 1}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_int8(dims, - weights_dim, - 1, - {2, 2}, - {pad, pad}, - {1, 1}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -636,30 +650,37 @@ 
TEST(TestConvRandInt8, test_conv_rand) { for (auto& kw : {1, 2, 3}) { for (auto& kh : {1, 2, 3}) { for (auto& stride : {1, 2}) { - for (auto& pad : {0, 1, 2}) { - for (auto& dila : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - if (cin % g != 0 || cout % g != 0) { - continue; - } - std::vector dims; - DDim weights_dim({cout, cin / g, kh, kw}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 3, 19, 32, 28}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_top : {0, 1, 2}) { + for (auto& pad_bottom : {0, 1, 2}) { + for (auto& pad_left : {0, 1, 2}) { + for (auto& pad_right : {0, 1, 2}) { + for (auto& dila : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + if (cin % g != 0 || cout % g != 0) { + continue; + } + std::vector dims; + DDim weights_dim({cout, cin / g, kh, kw}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 3, 19, 32, 28}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_int8( + dims, + weights_dim, + g, + {stride, stride}, + {pad_top, pad_bottom, pad_left, pad_right}, + {dila, dila}, + flag_bias, + flag_relu, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_conv_int8(dims, - weights_dim, - g, - {stride, stride}, - {pad, pad}, - {dila, dila}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -688,7 +709,7 @@ TEST(TestConvCustomInt8, test_conv_custom_size) { FLAGS_kernel_w}), FLAGS_group, {FLAGS_stride_h, FLAGS_stride_w}, - {FLAGS_pad_h, FLAGS_pad_w}, + {FLAGS_pad_h, FLAGS_pad_h, FLAGS_pad_w, FLAGS_pad_w}, {FLAGS_dila_h, FLAGS_dila_w}, FLAGS_flag_bias, FLAGS_flag_relu, diff --git a/lite/tests/math/conv_transpose_compute_test.cc b/lite/tests/math/conv_transpose_compute_test.cc index e0da07a534..398e745d94 100644 --- a/lite/tests/math/conv_transpose_compute_test.cc +++ b/lite/tests/math/conv_transpose_compute_test.cc @@ -15,10 +15,10 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/operators/op_params.h" #include "lite/tests/utils/naive_math_impl.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" #ifdef LITE_WITH_ARM #include "lite/kernels/arm/conv_transpose_compute.h" @@ -59,17 +59,19 @@ DEFINE_bool(flag_bias, false, "with bias"); typedef paddle::lite::DDim DDim; typedef paddle::lite::Tensor Tensor; typedef paddle::lite::operators::ConvParam ConvParam; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DDim compute_out_dim(const DDim& dim_in, const paddle::lite::operators::ConvParam& param) { auto filter_dims = param.filter->dims(); DDim output_shape = dim_in; output_shape[1] = filter_dims[1] * param.groups; + auto paddings = *param.paddings; + auto dilations = *param.dilations; for (int i = 0; i < 2; i++) { - int kernel_extent = param.dilations[i] * (filter_dims[i + 2] - 1) + 1; + int kernel_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; int output_len = (dim_in[i + 2] - 1) * param.strides[i] + kernel_extent - - 2 * param.paddings[i]; + (paddings[2 * i] + paddings[2 * i + 1]); output_shape[i + 2] = output_len; } return output_shape; @@ -101,19 +103,19 @@ void test_conv_transpose_fp32(const std::vector& input_dims, param.bias->set_precision(PRECISION(kFloat)); } param.strides = strides; - param.paddings = pads; - param.dilations = dilas; + param.paddings = std::make_shared>(pads); + param.dilations = std::make_shared>(dilas); param.fuse_relu = flag_relu; param.groups = group; param.output = new Tensor; 
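// Editorial note, a sketch that is not part of the original patch: with the
// new 4-value padding convention {pad_top, pad_bottom, pad_left, pad_right},
// compute_out_dim() above sizes each transposed-conv output axis as
//   out = (in - 1) * stride + dilation * (k - 1) + 1 - (pad_begin + pad_end).
// Worked example, assuming in = 16, stride = 2, k = 3, dilation = 1 and
// pads {1, 1} on that axis:
static_assert((16 - 1) * 2 + 1 * (3 - 1) + 1 - (1 + 1) == 31,
              "illustrative check of the deconv output-size formula");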
param.output->set_precision(PRECISION(kFloat)); - // paddle::lite::fill_tensor_rand(*param.filter, -1.f, 1.f); - paddle::lite::fill_tensor_const(*param.filter, 1.f); + paddle::lite::fill_tensor_rand(*param.filter, -1.f, 1.f); + // paddle::lite::fill_tensor_const(*param.filter, 1.f); if (flag_bias) { - // paddle::lite::fill_tensor_rand(*param.bias, -1.f, 1.f); - paddle::lite::fill_tensor_const(*param.bias, 1.f); + paddle::lite::fill_tensor_rand(*param.bias, -1.f, 1.f); + // paddle::lite::fill_tensor_const(*param.bias, 1.f); } Tensor tmp_weights; tmp_weights.Resize(weight_dim); @@ -128,21 +130,8 @@ void test_conv_transpose_fp32(const std::vector& input_dims, new paddle::lite::KernelContext); auto& ctx = ctx1->As(); ctx.SetRunMode(static_cast(cls), th); - /// set param and context - for (auto& dim_in : input_dims) { - param.x->Resize(dim_in); - DDim out_tmp_dims = compute_out_dim(dim_in, param); - if (out_tmp_dims[2] < 1 || out_tmp_dims[3] < 1) { - continue; - } - param.output->Resize(out_tmp_dims); - break; - } conv_t.SetParam(param); conv_t.SetContext(std::move(ctx1)); - /// prepare for run - conv_t.PrepareForRun(); - for (auto& dim_in : input_dims) { CHECK_EQ(weight_dim[0], dim_in[1]) << "input channel must equal to weights channel"; @@ -152,9 +141,11 @@ void test_conv_transpose_fp32(const std::vector& input_dims, } param.x->Resize(dim_in); param.output->Resize(dim_out); - - // paddle::lite::fill_tensor_rand(*param.x, -1.f, 1.f); - paddle::lite::fill_tensor_const(*param.x, 1.f); + param.filter->CopyDataFrom(tmp_weights); + // prepare for run + conv_t.PrepareForRun(); + paddle::lite::fill_tensor_rand(*param.x, -1.f, 1.f); + // paddle::lite::fill_tensor_const(*param.x, 1.f); auto din = param.x->data(); Tensor tout_basic; @@ -182,8 +173,10 @@ void test_conv_transpose_fp32(const std::vector& input_dims, strides[0], dilas[1], dilas[0], - pads[1], + pads[2], + pads[3], pads[0], + pads[1], flag_bias, flag_relu); } @@ -194,19 +187,19 @@ void test_conv_transpose_fp32(const std::vector& input_dims, /// compute Timer t0; for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); conv_t.Launch(); - t0.end(); + t0.Stop(); } float gops = 2.f * tmp_weights.numel() * dim_in[0] * dim_in[2] * dim_in[3]; LOG(INFO) << "conv fp32: input shape: " << dim_in << ", output shape" - << dim_out << ",running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); if (FLAGS_check_result) { double max_ratio = 0; @@ -228,7 +221,8 @@ void test_conv_transpose_fp32(const std::vector& input_dims, LOG(FATAL) << "test fp32 conv: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " + << pads[2] << ", " << pads[3] << ", stride: " << strides[0] << ", " << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? 
"true" : "false") @@ -240,9 +234,9 @@ void test_conv_transpose_fp32(const std::vector& input_dims, } LOG(INFO) << "test fp32 conv: input: " << dim_in << ", output: " << dim_out << ", weight dim: " << weight_dim - << ", pad: " << pads[0] << ", " << pads[1] - << ", stride: " << strides[0] << ", " << strides[1] - << ", dila_: " << dilas[0] << ", " << dilas[1] + << ", pad: " << pads[0] << ", " << pads[1] << ", " << pads[2] + << ", " << pads[3] << ", stride: " << strides[0] << ", " + << strides[1] << ", dila_: " << dilas[0] << ", " << dilas[1] << ", bias: " << (flag_bias ? "true" : "false") << ", relu: " << (flag_relu ? "true" : "false") << ", threads: " << th << ", power_mode: " << cls @@ -278,30 +272,37 @@ TEST(TestConvRand, test_conv_transpose_rand) { for (auto& kw : {1, 2, 3}) { for (auto& kh : {1, 2, 3}) { for (auto& stride : {1, 2}) { - for (auto& pad : {0, 1, 2}) { - for (auto& dila : {1, 2}) { - for (auto& flag_bias : {false, true}) { - for (auto& flag_relu : {false, true}) { - if (cin % g != 0 || cout % g != 0) { - continue; - } - std::vector dims; - DDim weights_dim({cin, cout / g, kh, kw}); - for (auto& batch : {1, 2}) { - for (auto& h : {1, 3, 19, 32, 28}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_h0 : {0, 1, 2}) { + for (auto& pad_h1 : {0, 1, 2}) { + for (auto& pad_w0 : {0, 1, 2}) { + for (auto& pad_w1 : {0, 1, 2}) { + for (auto& dila : {1, 2}) { + for (auto& flag_bias : {false, true}) { + for (auto& flag_relu : {false, true}) { + if (cin % g != 0 || cout % g != 0) { + continue; + } + std::vector dims; + DDim weights_dim({cin, cout / g, kh, kw}); + for (auto& batch : {1, 2}) { + for (auto& h : {1, 3, 19, 32, 28}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_conv_transpose_fp32( + dims, + weights_dim, + g, + {stride, stride}, + {pad_h0, pad_h1, pad_w0, pad_w1}, + {dila, dila}, + flag_bias, + flag_relu, + {1, 4}, + {FLAGS_power_mode}); + } } } - test_conv_transpose_fp32(dims, - weights_dim, - g, - {stride, stride}, - {pad, pad}, - {dila, dila}, - flag_bias, - flag_relu, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -330,7 +331,7 @@ TEST(TestConvCustom, test_conv_transpose_fp32_custom_size) { FLAGS_kernel_w}), FLAGS_group, {FLAGS_stride_h, FLAGS_stride_w}, - {FLAGS_pad_h, FLAGS_pad_w}, + {FLAGS_pad_h, FLAGS_pad_h, FLAGS_pad_w, FLAGS_pad_w}, {FLAGS_dila_h, FLAGS_dila_w}, FLAGS_flag_bias, FLAGS_flag_relu, diff --git a/lite/tests/math/gemm_int8_compute_test.cc b/lite/tests/math/gemm_int8_compute_test.cc index 06a1a0a65e..fde5aacb1c 100644 --- a/lite/tests/math/gemm_int8_compute_test.cc +++ b/lite/tests/math/gemm_int8_compute_test.cc @@ -20,12 +20,12 @@ #include "lite/backends/arm/math/funcs.h" #endif // LITE_WITH_ARM #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/core/tensor.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" typedef paddle::lite::Tensor Tensor; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DEFINE_int32(power_mode, 3, @@ -193,7 +193,7 @@ bool test_gemm_int8(bool tra, dbias_int8[l] = dbias[l] / scale_c[0]; } for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); paddle::lite::arm::math::gemm_prepack_int8(tpackedA.data(), db, dbias_int8, @@ -206,21 +206,21 @@ bool test_gemm_int8(bool tra, trb, scale_merge_int8.data(), &ctx); - t0.end(); + t0.Stop(); } LOG(INFO) << "gemm_int8_int8 output: M: " << m << ", N: " << n << ", K: " << k << ", power_mode: " << cls << ", threads: " << ths << ", GOPS: " << ops * 1e-9f - << " GOPS, avg time: " << 
t0.get_average_ms() - << " ms, min time: " << t0.get_min_time() - << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms() - << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time() + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() << " GOPs"; /// fp32 output compute - t0.clear(); + t0.Reset(); for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); paddle::lite::arm::math::gemm_prepack_int8(tpackedA.data(), db, dbias, @@ -233,15 +233,15 @@ bool test_gemm_int8(bool tra, trb, scale_merge_fp32.data(), &ctx); - t0.end(); + t0.Stop(); } LOG(INFO) << "gemm_int8_fp32 output: M: " << m << ", N: " << n << ", K: " << k << ", power_mode: " << cls << ", threads: " << ths << ", GOPS: " << ops * 1e-9f - << " GOPS, avg time: " << t0.get_average_ms() - << " ms, min time: " << t0.get_min_time() - << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms() - << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time() + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() << " GOPs"; if (FLAGS_check_result) { diff --git a/lite/tests/math/gemv_int8_compute_test.cc b/lite/tests/math/gemv_int8_compute_test.cc index c64e78d66a..623615c8da 100644 --- a/lite/tests/math/gemv_int8_compute_test.cc +++ b/lite/tests/math/gemv_int8_compute_test.cc @@ -20,12 +20,12 @@ #include "lite/backends/arm/math/funcs.h" #endif // LITE_WITH_ARM #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/core/tensor.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" typedef paddle::lite::Tensor Tensor; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DEFINE_int32(power_mode, 3, @@ -165,7 +165,7 @@ bool test_gemv_int8( dbias_int8[l] = dbias[l] / scale_c[0]; } for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); paddle::lite::arm::math::gemv_int8(da, db, dc_fp32, @@ -177,21 +177,21 @@ bool test_gemv_int8( dbias, has_relu, &ctx); - t0.end(); + t0.Stop(); } LOG(INFO) << "gemv_int8_int8 output: M: " << m << ", N: " << n << ", power_mode: " << cls << ", threads: " << ths << ", GOPS: " << ops * 1e-9f - << " GOPS, avg time: " << t0.get_average_ms() - << " ms, min time: " << t0.get_min_time() - << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms() - << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time() + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() << " GOPs"; /// fp32 output compute - t0.clear(); + t0.Reset(); for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); paddle::lite::arm::math::gemv_int8(da, db, dc_int8, @@ -203,15 +203,15 @@ bool test_gemv_int8( dbias_int8, has_relu, &ctx); - t0.end(); + t0.Stop(); } LOG(INFO) << "gemm_int8_fp32 output: M: " << m << ", N: " << n << ", power_mode: " << cls << ", threads: " << ths << ", GOPS: " << ops * 1e-9f - << " GOPS, avg time: " << t0.get_average_ms() - << " ms, min time: " << t0.get_min_time() - << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms() - << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time() + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min 
time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() << " GOPs"; if (FLAGS_check_result) { diff --git a/lite/tests/math/layout_compute_test.cc b/lite/tests/math/layout_compute_test.cc new file mode 100644 index 0000000000..a566924548 --- /dev/null +++ b/lite/tests/math/layout_compute_test.cc @@ -0,0 +1,608 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/core/context.h" +#include "lite/core/profile/timer.h" +#include "lite/operators/op_params.h" +#include "lite/tests/utils/naive_math_impl.h" +#include "lite/tests/utils/tensor_utils.h" + +#ifdef LITE_WITH_ARM +#include "lite/kernels/arm/layout_compute.h" +#endif // LITE_WITH_ARM + +DEFINE_int32(power_mode, + 3, + "power mode: " + "0 for POWER_HIGH;" + "1 for POWER_LOW;" + "2 for POWER_FULL;" + "3 for NO_BIND"); +DEFINE_int32(threads, 1, "threads num"); +DEFINE_int32(warmup, 0, "warmup times"); +DEFINE_int32(repeats, 1, "repeats times"); +DEFINE_bool(basic_test, false, "do all tests"); +DEFINE_bool(check_result, true, "check the result"); + +DEFINE_int32(batch, 1, "batch size"); +DEFINE_int32(in_channel, 32, "input channel"); +DEFINE_int32(in_height, 112, "input height"); +DEFINE_int32(in_width, 112, "input width"); + +DEFINE_bool(flag_nchw, true, "do nchw to nhwc"); + +typedef paddle::lite::DDim DDim; +typedef paddle::lite::Tensor Tensor; +typedef paddle::lite::operators::LayoutParam LayoutParam; + +using paddle::lite::profile::Timer; + +#define IN(n, c, h, w) \ + input_data[w + h * input_w + c * input_h * input_w + \ + n * input_c * input_h * input_w] +#define OUT(n, c, h, w) \ + output_data[w + h * output_w + c * output_h * output_w + \ + n * output_c * output_h * output_w] + +template +void nchw2nhwc_ref(const Tensor* input, Tensor* output) { + auto* input_data = input->data(); + auto* output_data = output->mutable_data(); + + int input_n = input->dims()[0]; + int input_c = input->dims()[1]; + int input_h = input->dims()[2]; + int input_w = input->dims()[3]; + int output_c = output->dims()[1]; + int output_h = output->dims()[2]; + int output_w = output->dims()[3]; + + for (int n = 0; n < input_n; ++n) { + for (int c = 0; c < input_c; ++c) { + for (int h = 0; h < input_h; ++h) { + for (int w = 0; w < input_w; ++w) { + OUT(n, h, w, c) = IN(n, c, h, w); + } + } + } + } +} +#undef IN +#undef OUT + +#define IN(n, h, w, c) \ + input_data[c + w * input_c + h * input_w * input_c + \ + n * input_h * input_w * input_c] +#define OUT(n, h, w, c) \ + output_data[c + w * output_c + h * output_w * output_c + \ + n * output_h * output_w * output_c] +template +void nhwc2nchw_ref(const Tensor* input, Tensor* output) { + auto* input_data = input->data(); + auto* output_data = output->mutable_data(); + + int input_n = input->dims()[0]; + int input_h = input->dims()[1]; + int input_w = input->dims()[2]; + int input_c = input->dims()[3]; + int 
output_h = output->dims()[1]; + int output_w = output->dims()[2]; + int output_c = output->dims()[3]; + + for (int n = 0; n < input_n; ++n) { + for (int c = 0; c < input_c; ++c) { + for (int h = 0; h < input_h; ++h) { + for (int w = 0; w < input_w; ++w) { + OUT(n, c, h, w) = IN(n, h, w, c); + } + } + } + } +} + +#ifdef LITE_WITH_ARM +void test_layout_fp32_nchw(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + LayoutParam param; + param.x = new Tensor; + const_cast(param.x)->set_precision(PRECISION(kFloat)); + + param.y = new Tensor; + param.y->set_precision(PRECISION(kFloat)); + + for (auto& cls : power_mode) { + for (auto& th : thread_num) { + paddle::lite::kernels::arm::NCHWToNHWCCompute layout; + DDim dim_out({dim_in[0], dim_in[2], dim_in[3], dim_in[1]}); + + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), th); + /// set param and context + const_cast(param.x)->Resize(dim_in); + param.y->Resize(dim_out); + + layout.SetParam(param); + + paddle::lite::fill_tensor_rand( + *(const_cast(param.x)), -1.f, 1.f); + // paddle::lite::fill_tensor_const(*param.x, 1.f); + + auto din = param.x->data(); + + Tensor tout_basic; + + if (FLAGS_check_result) { + tout_basic.set_precision(PRECISION(kFloat)); + tout_basic.Resize(dim_out); + fill_tensor_const(tout_basic, 0.f); + auto dout_basic = tout_basic.mutable_data(); + nchw2nhwc_ref(param.x, &tout_basic); + } + /// warm up + for (int i = 0; i < FLAGS_warmup; ++i) { + layout.Run(); + } + /// compute + Timer t0; + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + layout.Run(); + t0.Stop(); + } + double gops = 2.0 * dim_out.production(); + LOG(INFO) << "layout fp32: input shape: " << dim_in << ", output shape" + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tout_basic, *param.y, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-3f) { + if (max_diff > 5e-4f) { + LOG(WARNING) << "din"; + print_tensor(*(const_cast(param.x))); + LOG(WARNING) << "basic result"; + print_tensor(tout_basic); + LOG(WARNING) << "lite result"; + print_tensor(*param.y); + Tensor tdiff; + tdiff.Resize(tout_basic.dims()); + tdiff.set_precision(PRECISION(kFloat)); + tensor_diff(tout_basic, *param.y, tdiff); + print_tensor(tdiff); + LOG(FATAL) << "test fp32 layout: input: " << dim_in + << ", output: " << dim_out << ", flag_nchw: " + << (flag_nchw ? "nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " failed!!\n"; + } + } + LOG(INFO) << "test fp32 layout: input: " << dim_in + << ", output: " << dim_out + << ", flag_nchw: " << (flag_nchw ? 
"nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " successed!!\n"; + } + } + } + + delete param.x; + delete param.y; +} +void test_layout_fp32_nhwc(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + + LayoutParam param; + param.x = new Tensor; + const_cast(param.x)->set_precision(PRECISION(kFloat)); + + param.y = new Tensor; + param.y->set_precision(PRECISION(kFloat)); + + for (auto& cls : power_mode) { + for (auto& th : thread_num) { + paddle::lite::kernels::arm::NHWCToNCHWCompute layout; + // n h w c == n c h w + DDim dim_out({dim_in[0], dim_in[3], dim_in[1], dim_in[2]}); + + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), th); + /// set param and context + const_cast(param.x)->Resize(dim_in); + param.y->Resize(dim_out); + + layout.SetParam(param); + + paddle::lite::fill_tensor_rand( + *(const_cast(param.x)), -1.f, 1.f); + // paddle::lite::fill_tensor_const(*param.x, 1.f); + + auto din = param.x->data(); + + Tensor tout_basic; + + if (FLAGS_check_result) { + tout_basic.set_precision(PRECISION(kFloat)); + tout_basic.Resize(dim_out); + fill_tensor_const(tout_basic, 0.f); + auto dout_basic = tout_basic.mutable_data(); + nhwc2nchw_ref(param.x, &tout_basic); + } + /// warm up + for (int i = 0; i < FLAGS_warmup; ++i) { + layout.Run(); + } + /// compute + Timer t0; + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + layout.Run(); + t0.Stop(); + } + double gops = 2.0 * dim_out.production(); + LOG(INFO) << "layout fp32: input shape: " << dim_in << ", output shape" + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tout_basic, *param.y, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-3f) { + if (max_diff > 5e-4f) { + LOG(WARNING) << "din"; + print_tensor(*(const_cast(param.x))); + LOG(WARNING) << "basic result"; + print_tensor(tout_basic); + LOG(WARNING) << "lite result"; + print_tensor(*param.y); + Tensor tdiff; + tdiff.Resize(tout_basic.dims()); + tdiff.set_precision(PRECISION(kFloat)); + tensor_diff(tout_basic, *param.y, tdiff); + print_tensor(tdiff); + LOG(FATAL) << "test fp32 layout: input: " << dim_in + << ", output: " << dim_out << ", flag_nchw: " + << (flag_nchw ? "nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " failed!!\n"; + } + } + LOG(INFO) << "test fp32 layout: input: " << dim_in + << ", output: " << dim_out + << ", flag_nchw: " << (flag_nchw ? 
"nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " successed!!\n"; + } + } + } + + delete param.x; + delete param.y; +} +void test_layout_int8_nchw(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + + LayoutParam param; + param.x = new Tensor; + const_cast(param.x)->set_precision(PRECISION(kInt8)); + + param.y = new Tensor; + param.y->set_precision(PRECISION(kInt8)); + + for (auto& cls : power_mode) { + for (auto& th : thread_num) { + paddle::lite::kernels::arm::NCHWToNHWCCompute layout; + DDim dim_out({dim_in[0], dim_in[2], dim_in[3], dim_in[1]}); + + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), th); + /// set param and context + const_cast(param.x)->Resize(dim_in); + param.y->Resize(dim_out); + + layout.SetParam(param); + + paddle::lite::fill_tensor_rand(*(const_cast(param.x))); + // paddle::lite::fill_tensor_const(*param.x, 1.f); + + auto din = param.x->data(); + + Tensor tout_basic; + + if (FLAGS_check_result) { + tout_basic.set_precision(PRECISION(kInt8)); + tout_basic.Resize(dim_out); + fill_tensor_const(tout_basic, 0); + auto dout_basic = tout_basic.mutable_data(); + nchw2nhwc_ref(param.x, &tout_basic); + } + LOG(INFO) << "saber compute"; + /// warm up + for (int i = 0; i < FLAGS_warmup; ++i) { + layout.Run(); + } + /// compute + Timer t0; + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + layout.Run(); + t0.Stop(); + } + LOG(INFO) << "saber compute end"; + double gops = 2.0 * dim_out.production(); + LOG(INFO) << "layout int8: input shape: " << dim_in << ", output shape" + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tout_basic, *param.y, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-3f) { + if (max_diff > 5e-4f) { + LOG(WARNING) << "din"; + print_tensor(*(const_cast(param.x))); + LOG(WARNING) << "basic result"; + print_tensor(tout_basic); + LOG(WARNING) << "lite result"; + print_tensor(*param.y); + Tensor tdiff; + tdiff.Resize(tout_basic.dims()); + tdiff.set_precision(PRECISION(kInt8)); + tensor_diff(tout_basic, *param.y, tdiff); + print_tensor(tdiff); + LOG(FATAL) << "test int8 layout: input: " << dim_in + << ", output: " << dim_out << ", flag_nchw: " + << (flag_nchw ? "nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " failed!!\n"; + } + } + LOG(INFO) << "test int8 layout: input: " << dim_in + << ", output: " << dim_out + << ", flag_nchw: " << (flag_nchw ? 
"nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " successed!!\n"; + } + } + } + + delete param.x; + delete param.y; +} +void test_layout_int8_nhwc(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + + LayoutParam param; + param.x = new Tensor; + const_cast(param.x)->set_precision(PRECISION(kInt8)); + + param.y = new Tensor; + param.y->set_precision(PRECISION(kInt8)); + + for (auto& cls : power_mode) { + for (auto& th : thread_num) { + paddle::lite::kernels::arm::NHWCToNCHWCompute layout; + // n h w c == n c h w + DDim dim_out({dim_in[0], dim_in[3], dim_in[1], dim_in[2]}); + + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), th); + /// set param and context + const_cast(param.x)->Resize(dim_in); + param.y->Resize(dim_out); + + layout.SetParam(param); + + paddle::lite::fill_tensor_rand(*(const_cast(param.x))); + // paddle::lite::fill_tensor_const(*param.x, 1.f); + + auto din = param.x->data(); + + Tensor tout_basic; + + if (FLAGS_check_result) { + tout_basic.set_precision(PRECISION(kInt8)); + tout_basic.Resize(dim_out); + fill_tensor_const(tout_basic, 0.f); + auto dout_basic = tout_basic.mutable_data(); + nhwc2nchw_ref(param.x, &tout_basic); + } + LOG(INFO) << "saber compute"; + /// warm up + for (int i = 0; i < FLAGS_warmup; ++i) { + layout.Run(); + } + /// compute + Timer t0; + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + layout.Run(); + t0.Stop(); + } + LOG(INFO) << "run"; + double gops = 2.0 * dim_out.production(); + LOG(INFO) << "layout int8: input shape: " << dim_in << ", output shape" + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() + << ", total GOPS: " << 1e-9 * gops + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tout_basic, *param.y, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-3f) { + if (max_diff > 5e-4f) { + LOG(WARNING) << "din"; + print_tensor(*(const_cast(param.x))); + LOG(WARNING) << "basic result"; + print_tensor(tout_basic); + LOG(WARNING) << "lite result"; + print_tensor(*param.y); + Tensor tdiff; + tdiff.Resize(tout_basic.dims()); + tdiff.set_precision(PRECISION(kInt8)); + tensor_diff(tout_basic, *param.y, tdiff); + print_tensor(tdiff); + LOG(FATAL) << "test int8 layout: input: " << dim_in + << ", output: " << dim_out << ", flag_nchw: " + << (flag_nchw ? "nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " failed!!\n"; + } + } + LOG(INFO) << "test int8 layout: input: " << dim_in + << ", output: " << dim_out + << ", flag_nchw: " << (flag_nchw ? 
"nchw2nhwc" : "nhwc2nchw") + << ", threads: " << th << ", power_mode: " << cls + << " successed!!\n"; + } + } + } + + delete param.x; + delete param.y; +} +#else +void test_layout_fp32_nchw(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) {} +void test_layout_fp32_nhwc(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) {} +void test_layout_int8_nchw(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) {} +void test_layout_int8_nhwc(DDim dim_in, + bool flag_nchw, + const std::vector& thread_num, + const std::vector& power_mode) {} +#endif // LITE_WITH_ARM + +#if 1 // +TEST(TestLayout, test_Layout_fp32) { + if (FLAGS_basic_test) { + for (auto n : {1, 3}) { + for (auto c : {1, 3, 5, 32}) { + for (auto h : {3, 16, 20, 32}) { + for (auto w : {3, 4, 32, 112}) { + for (auto nchw2nhwc : {true, false}) { + DDim dim_in({n, c, h, w}); + if (nchw2nhwc) { + LOG(INFO) << "NCHW2NHWC"; + test_layout_fp32_nchw( + dim_in, nchw2nhwc, {1, 2, 4}, {FLAGS_power_mode}); + } else { + LOG(INFO) << "NHWC2NCHW"; + test_layout_fp32_nhwc( + dim_in, nchw2nhwc, {1, 2, 4}, {FLAGS_power_mode}); + } + } + } + } + } + } + } +} +#endif +#if 1 +TEST(TestLayout, test_Layout_int8) { + if (FLAGS_basic_test) { + for (auto n : {1, 3}) { + for (auto c : {1, 3, 5, 32}) { + for (auto h : {3, 16, 20, 32}) { + for (auto w : {3, 4, 32, 112}) { + for (auto nchw2nhwc : {true, false}) { + DDim dim_in({n, c, h, w}); + if (nchw2nhwc) { + LOG(INFO) << "NCHW2NHWC int8"; + test_layout_int8_nchw( + dim_in, nchw2nhwc, {1, 2, 4}, {FLAGS_power_mode}); + } else { + LOG(INFO) << "NHWC2NCHW int8"; + test_layout_int8_nhwc( + dim_in, nchw2nhwc, {1, 2, 4}, {FLAGS_power_mode}); + } + } + } + } + } + } + } +} +#endif + +#if 1 /// custom +TEST(TestLayoutCustom, test_Layout_custom_size) { + test_layout_fp32_nchw( + {DDim({FLAGS_batch, FLAGS_in_channel, FLAGS_in_height, FLAGS_in_width})}, + true, + {FLAGS_threads}, + {FLAGS_power_mode}); +} +#endif // custom diff --git a/lite/tests/math/pool_compute_test.cc b/lite/tests/math/pool_compute_test.cc index 9f4a943594..73a5ba5606 100644 --- a/lite/tests/math/pool_compute_test.cc +++ b/lite/tests/math/pool_compute_test.cc @@ -15,10 +15,10 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/operators/op_params.h" #include "lite/tests/utils/naive_math_impl.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" #ifdef LITE_WITH_ARM #include "lite/kernels/arm/pool_compute.h" @@ -60,7 +60,7 @@ DEFINE_string(pooling_type, "max", "do max pooling"); typedef paddle::lite::DDim DDim; typedef paddle::lite::Tensor Tensor; typedef paddle::lite::operators::PoolParam PoolParam; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DDim compute_out_dim(const DDim& dim_in, const paddle::lite::operators::PoolParam& param) { @@ -69,8 +69,7 @@ DDim compute_out_dim(const DDim& dim_in, auto kernel_w = param.ksize[1]; auto h = dim_in[2]; auto w = dim_in[3]; - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; + auto paddings = *param.paddings; int stride_h = param.strides[0]; int stride_w = param.strides[1]; bool ceil_mode = param.ceil_mode; @@ -79,11 +78,15 @@ DDim compute_out_dim(const DDim& dim_in, int wout = 1; if (!flag_global) { if (!ceil_mode) { - hout = (h - kernel_h + 2 * pad_h) / stride_h + 1; - wout = (w - kernel_w + 2 * pad_w) / stride_w + 1; + hout = (h - kernel_h + paddings[0] + 
paddings[1]) / stride_h + 1; + wout = (w - kernel_w + paddings[2] + paddings[3]) / stride_w + 1; } else { - hout = (h - kernel_h + 2 * pad_h + stride_h - 1) / stride_h + 1; - wout = (w - kernel_w + 2 * pad_w + stride_w - 1) / stride_w + 1; + hout = + (h - kernel_h + paddings[0] + paddings[1] + stride_h - 1) / stride_h + + 1; + wout = + (w - kernel_w + paddings[2] + paddings[3] + stride_w - 1) / stride_w + + 1; } } dim_out[2] = hout; @@ -116,7 +119,7 @@ void pooling_basic(const float* din, int stride_h = strides[0]; int stride_w = strides[1]; int pad_h = paddings[0]; - int pad_w = paddings[1]; + int pad_w = paddings[2]; int size_channel_in = win * hin; int size_channel_out = wout * hout; if (global_pooling) { @@ -195,18 +198,22 @@ void pooling_basic(const float* din, int bh = kernel_h; int bw = kernel_w; if (ew == win) { - bw = sw + kernel_w >= win + pad_w ? win + pad_w - : sw + kernel_w; + bw = (sw + kernel_w) >= (win + paddings[3]) + ? (win + paddings[3]) + : (sw + kernel_w); bw -= sw; - if (sw - pad_w < 0 && sw + kernel_w > win + pad_w) { + if ((sw - pad_w) < 0 && + (sw + kernel_w) > (win + paddings[3])) { bw += pad_w; } } if (eh == hin) { - bh = sh + kernel_h >= hin + pad_h ? hin + pad_h - : sh + kernel_h; + bh = (sh + kernel_h) >= (hin + paddings[1]) + ? (hin + paddings[1]) + : (sh + kernel_h); bh -= sh; - if (sh - pad_h < 0 && sh + kernel_h > hin + pad_h) { + if ((sh - pad_h) < 0 && + (sh + kernel_h) > (hin + paddings[1])) { bh += pad_h; } } @@ -243,7 +250,7 @@ void test_pool_fp32(const std::vector& input_dims, param.ksize = ksize; param.strides = strides; - param.paddings = pads; + param.paddings = std::make_shared>(pads); param.ceil_mode = ceil_mode; param.global_pooling = flag_global; param.pooling_type = pooling_type; @@ -313,18 +320,18 @@ void test_pool_fp32(const std::vector& input_dims, /// compute Timer t0; for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); pool.Launch(); - t0.end(); + t0.Stop(); } double gops = 2.0 * dim_out.production() * ksize[0] * ksize[1]; LOG(INFO) << "pool fp32: input shape: " << dim_in << ", output shape" - << dim_out << ", running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << dim_out << ", running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); if (FLAGS_check_result) { double max_ratio = 0; @@ -399,31 +406,38 @@ TEST(TestPoolRand, test_pool_rand) { for (auto& kw : {1, 2, 3}) { for (auto& kh : {1, 2, 3}) { for (auto& stride : {1, 2}) { - for (auto& pad : {0, 1, 2}) { - for (auto& flag_global : {false, true}) { - for (auto& exclusive : {false, true}) { - for (auto& ceil_mode : {false, true}) { - for (auto& pooling_type : {"max", "avg"}) { - bool adaptive = false; - bool use_quantizer = false; - std::vector dims; - for (auto& batch : {1, 2}) { - for (auto& h : {1, 2, 3, 4, 11, 19, 32, 28}) { - dims.push_back(DDim({batch, cin, h, h})); + for (auto& pad_top : {0, 1, 2}) { + for (auto& pad_bottom : {0, 1, 2}) { + for (auto& pad_left : {0, 1, 2}) { + for (auto& pad_right : {0, 1, 2}) { + for (auto& flag_global : {false, true}) { + for (auto& exclusive : {false, true}) { + for (auto& ceil_mode : {false, true}) { + for (auto& pooling_type : {"max", "avg"}) { + bool adaptive = false; + bool 
use_quantizer = false; + std::vector dims; + for (auto& batch : {1, 2}) { + for (auto& h : {1, 2, 3, 4, 11, 19, 32, 28}) { + dims.push_back(DDim({batch, cin, h, h})); + } + } + test_pool_fp32( + dims, + {kh, kw}, + {stride, stride}, + {pad_top, pad_bottom, pad_left, pad_right}, + ceil_mode, + flag_global, + exclusive, + adaptive, + use_quantizer, + pooling_type, + {1, 2, 4}, + {FLAGS_power_mode}); + } } } - test_pool_fp32(dims, - {kh, kw}, - {stride, stride}, - {pad, pad}, - ceil_mode, - flag_global, - exclusive, - adaptive, - use_quantizer, - pooling_type, - {1, 2, 4}, - {FLAGS_power_mode}); } } } @@ -443,7 +457,7 @@ TEST(TesPoolCustom, test_pool_fp32_custom_size) { {DDim({FLAGS_batch, FLAGS_in_channel, FLAGS_in_height, FLAGS_in_width})}, {FLAGS_kernel_h, FLAGS_kernel_w}, {FLAGS_stride_h, FLAGS_stride_w}, - {FLAGS_pad_h, FLAGS_pad_w}, + {FLAGS_pad_h, FLAGS_pad_h, FLAGS_pad_w, FLAGS_pad_w}, FLAGS_ceil_mode, FLAGS_flag_global, FLAGS_exclusive, diff --git a/lite/tests/math/sgemm_c4_compute_test.cc b/lite/tests/math/sgemm_c4_compute_test.cc new file mode 100644 index 0000000000..886dba6ac5 --- /dev/null +++ b/lite/tests/math/sgemm_c4_compute_test.cc @@ -0,0 +1,236 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
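+
+// NOTE: a brief sketch of the c4 packing this test assumes, inferred from
+// basic_trans_mat_to_c4 in lite/tests/utils/naive_math_impl.h (not an
+// authoritative spec of sgemm_prepack_c4). Rows are interleaved in groups of
+// four: for A (m x k, row-major) the packed buffer stores, for each row block
+// i and each column j,
+//   {A[4i+0][j], A[4i+1][j], A[4i+2][j], A[4i+3][j]},
+// with m and k zero-padded to multiples of 4 (m_round, k_round). B (k x n) is
+// packed the same way along k, while n stays unpadded (pack_k == false). The
+// reference output tc_basic is written in the same layout, so it can be
+// compared element-wise against the result of sgemm_prepack_c4.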
+ +#include +#include +#include "lite/tests/utils/fill_data.h" +#include "lite/tests/utils/naive_math_impl.h" +#ifdef LITE_WITH_ARM +#include "lite/backends/arm/math/funcs.h" +#endif // LITE_WITH_ARM +#include "lite/core/context.h" +#include "lite/core/profile/timer.h" +#include "lite/core/tensor.h" +#include "lite/tests/utils/tensor_utils.h" + +typedef paddle::lite::Tensor Tensor; +using paddle::lite::profile::Timer; + +DEFINE_int32(power_mode, + 3, + "power mode: " + "0 for POWER_HIGH;" + "1 for POWER_LOW;" + "2 for POWER_FULL;" + "3 for NO_BIND"); +DEFINE_int32(threads, 1, "threads num"); +DEFINE_int32(warmup, 0, "warmup times"); +DEFINE_int32(repeats, 1, "repeats times"); +DEFINE_bool(basic_test, false, "do all tests"); +DEFINE_bool(check_result, true, "check the result"); + +DEFINE_int32(M, 512, "gemm_c4: M"); +DEFINE_int32(N, 512, "gemm_c4: N"); +DEFINE_int32(K, 512, "gemm_c4: K"); + +DEFINE_bool(flag_relu, false, "do relu"); +DEFINE_bool(flag_bias, false, "with bias"); + +bool test_sgemm_c4( + int m, int n, int k, bool has_bias, bool has_relu, int cls, int ths) { + int m_round = (m + 3) / 4 * 4; + int k_round = (k + 3) / 4 * 4; + int size_a = m * k; + int size_b = n * k; + int size_a_c4 = m_round * k_round; + int size_b_c4 = k_round * n; + + Tensor ta; + Tensor tb; + Tensor ta_c4; + Tensor tb_c4; + Tensor tc; + Tensor tc_basic; + Tensor tc_backup; + Tensor tbias; + + ta.Resize({size_a}); + tb.Resize({size_b}); + ta_c4.Resize({size_a_c4}); + tb_c4.Resize({size_b_c4}); + tc.Resize({m_round * n}); + tc_basic.Resize({m_round * n}); + tbias.Resize({m}); + + ta.set_precision(PRECISION(kFloat)); + tb.set_precision(PRECISION(kFloat)); + ta_c4.set_precision(PRECISION(kFloat)); + tb_c4.set_precision(PRECISION(kFloat)); + tc.set_precision(PRECISION(kFloat)); + tc_basic.set_precision(PRECISION(kFloat)); + tbias.set_precision(PRECISION(kFloat)); + + fill_tensor_rand(ta, -1.f, 1.f); + fill_tensor_rand(tb, -1.f, 1.f); + fill_tensor_rand(tbias, -1.f, 1.f); + fill_tensor_rand(tc, -1.f, 1.f); + + auto da = ta.mutable_data(); + auto db = tb.mutable_data(); + auto da_c4 = ta_c4.mutable_data(); + auto db_c4 = tb_c4.mutable_data(); + auto dc_basic = tc_basic.mutable_data(); + auto dbias = tbias.mutable_data(); + + // trans A, B to c4 + basic_trans_mat_to_c4(da, da_c4, k, m, k, true); + basic_trans_mat_to_c4(db, db_c4, n, k, n, false); + + LOG(INFO) << "sgemm_c4 M: " << m << ", N: " << n << ", K: " << k + << ", relu: " << (has_relu ? "true" : "false") + << ", bias: " << (has_bias ? "true" : "false"); + + if (FLAGS_check_result) { + basic_gemm_c4(false, + false, + m, + n, + k, + 1.f, + da, + k, + db, + n, + 0.f, + dc_basic, + n, + dbias, + has_bias, + has_relu); + } + Timer t0; +#ifdef LITE_WITH_ARM + //! 
compute + double ops = 2.0 * m_round * n * k_round; + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), ths); + auto dc = tc.mutable_data(); + for (int j = 0; j < FLAGS_warmup; ++j) { + paddle::lite::arm::math::sgemm_prepack_c4( + m, n, k, da_c4, db_c4, dc, dbias, has_bias, has_relu, &ctx); + } + + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + paddle::lite::arm::math::sgemm_prepack_c4( + m, n, k, da_c4, db_c4, dc, dbias, has_bias, has_relu, &ctx); + t0.Stop(); + } + LOG(INFO) << "M: " << m << ", N: " << n << ", K: " << k + << ", power_mode: " << cls << ", threads: " << ths + << ", GOPS: " << ops * 1e-9f + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() + << " GOPs"; + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + tensor_cmp_host(tc_basic, tc, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-4f && std::abs(max_diff) > 5e-5f) { + Tensor tdiff; + tdiff.set_precision(PRECISION(kFloat)); + tdiff.Resize(tc.dims()); + tensor_diff(tc_basic, tc, tdiff); + LOG(INFO) << "a: "; + print_tensor(ta); + LOG(INFO) << "a_c4: "; + print_tensor(ta_c4); + LOG(INFO) << "b: "; + print_tensor(tb); + LOG(INFO) << "b_c4: "; + print_tensor(tb_c4); + LOG(INFO) << "basic result: "; + print_tensor(tc_basic); + LOG(INFO) << "lite result: "; + print_tensor(tc); + LOG(INFO) << "diff result: "; + print_tensor(tdiff); + return false; + } + } +#endif + return true; +} + +TEST(TestSgemmC4, test_func_sgemm_c4_prepacked) { + if (FLAGS_basic_test) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + LOG(INFO) << "run basic sgemm_c4 test"; + for (auto& m : {1, 3, 8, 32, 397}) { + for (auto& n : {1, 2, 3, 4, 13, 141, 789}) { + for (auto& k : {1, 3, 8, 59, 234}) { + for (auto& has_bias : {false, true}) { + for (auto& has_relu : {false, true}) { + for (auto& th : {1, 2, 4}) { + auto flag = test_sgemm_c4( + m, n, k, has_bias, has_relu, FLAGS_power_mode, th); + if (flag) { + LOG(INFO) << "test m = " << m << ", n=" << n << ", k=" << k + << ", bias: " << (has_bias ? "true" : "false") + << ", relu: " << (has_relu ? "true" : "false") + << " passed\n"; + } else { + LOG(FATAL) << "test m = " << m << ", n=" << n << ", k=" << k + << ", bias: " << (has_bias ? "true" : "false") + << ", relu: " << (has_relu ? 
"true" : "false") + << " failed\n"; + } + } + } + } + } + } + } + } +} + +TEST(TestSgemmC4Custom, test_func_sgemm_c4_prepacked_custom) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + auto flag = test_sgemm_c4(FLAGS_M, + FLAGS_N, + FLAGS_K, + FLAGS_flag_bias, + FLAGS_flag_relu, + FLAGS_power_mode, + FLAGS_threads); + if (!flag) { + LOG(FATAL) << "test m = " << FLAGS_M << ", n=" << FLAGS_N + << ", k=" << FLAGS_K << ", bias: " << FLAGS_flag_bias + << ", relu: " << FLAGS_flag_relu << " failed!!"; + } + LOG(INFO) << "test m = " << FLAGS_M << ", n=" << FLAGS_N << ", k=" << FLAGS_K + << ", bias: " << FLAGS_flag_bias << ", relu: " << FLAGS_flag_relu + << " passed!!"; +} diff --git a/lite/tests/math/sgemm_compute_test.cc b/lite/tests/math/sgemm_compute_test.cc index 1621ceb904..6df5e671fe 100644 --- a/lite/tests/math/sgemm_compute_test.cc +++ b/lite/tests/math/sgemm_compute_test.cc @@ -20,12 +20,12 @@ #include "lite/backends/arm/math/funcs.h" #endif // LITE_WITH_ARM #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/core/tensor.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" typedef paddle::lite::Tensor Tensor; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DEFINE_int32(power_mode, 3, @@ -171,7 +171,7 @@ bool test_sgemm(bool tra, if (i == FLAGS_repeats - 1) { memcpy(dc, dc_backup, sizeof(float) * m * ldc); } - t0.start(); + t0.Start(); paddle::lite::arm::math::sgemm_prepack(trb, m, n, @@ -186,15 +186,15 @@ bool test_sgemm(bool tra, has_bias, has_relu, &ctx); - t0.end(); + t0.Stop(); } LOG(INFO) << "M: " << m << ", N: " << n << ", K: " << k << ", power_mode: " << cls << ", threads: " << ths << ", GOPS: " << ops * 1e-9f - << " GOPS, avg time: " << t0.get_average_ms() - << " ms, min time: " << t0.get_min_time() - << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms() - << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time() + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() << " GOPs"; if (FLAGS_check_result) { diff --git a/lite/tests/math/sgemv_compute_test.cc b/lite/tests/math/sgemv_compute_test.cc new file mode 100644 index 0000000000..5dd2d32295 --- /dev/null +++ b/lite/tests/math/sgemv_compute_test.cc @@ -0,0 +1,194 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "lite/tests/utils/fill_data.h" +#include "lite/tests/utils/naive_math_impl.h" +#ifdef LITE_WITH_ARM +#include "lite/backends/arm/math/funcs.h" +#endif // LITE_WITH_ARM +#include "lite/core/context.h" +#include "lite/core/profile/timer.h" +#include "lite/core/tensor.h" +#include "lite/tests/utils/tensor_utils.h" + +typedef paddle::lite::Tensor Tensor; + +DEFINE_int32(cluster, 3, "cluster id"); +DEFINE_int32(threads, 1, "threads num"); +DEFINE_int32(warmup, 0, "warmup times"); +DEFINE_int32(repeats, 1, "repeats times"); +DEFINE_bool(basic_test, true, "do all tests"); +DEFINE_bool(check_result, true, "check the result"); + +DEFINE_int32(M, 512, "sgemv: M"); +DEFINE_int32(K, 512, "sgemv: K"); + +DEFINE_bool(traA, false, "gemv: A transpose"); + +DEFINE_bool(flag_relu, false, "do relu"); +DEFINE_bool(flag_bias, false, "with bias"); + +bool test_sgemv( + bool tra, int m, int k, bool has_bias, bool has_relu, int cls, int ths) { + Tensor ta; + Tensor tb; + Tensor tc; + Tensor tc_basic; + Tensor tbias; + + ta.Resize({m, k}); + tb.Resize({k, 1}); + tc.Resize({m, 1}); + tc_basic.Resize({m, 1}); + tbias.Resize({m}); + + ta.set_precision(PRECISION(kFloat)); + tb.set_precision(PRECISION(kFloat)); + tc.set_precision(PRECISION(kFloat)); + tc_basic.set_precision(PRECISION(kFloat)); + tbias.set_precision(PRECISION(kFloat)); + + fill_tensor_rand(ta, -1.f, 1.f); + // fill_tensor_const(ta, 1.f); + fill_tensor_rand(tb, -1.f, 1.f); + // fill_tensor_const(tb, 1.f); + fill_tensor_rand(tbias, -1.f, 1.f); + + LOG(INFO) << "sgemv M: " << m << ", K: " << k + << ", transA: " << (tra ? "true" : "false") + << ", relu: " << (has_relu ? "true" : "false") + << ", bias: " << (has_bias ? "true" : "false"); +#ifdef LITE_WITH_ARM + + auto da = ta.mutable_data(); + auto db = tb.mutable_data(); + auto dc = tc.mutable_data(); + auto dc_basic = tc_basic.mutable_data(); + auto dbias = tbias.mutable_data(); + + if (FLAGS_check_result) { + basic_gemv( + m, k, da, db, dbias, dc_basic, 1.f, 0.f, tra, has_bias, has_relu); + } + paddle::lite::profile::Timer t0; + //! 
compute + double ops = 2.0 * m * k; + std::unique_ptr ctx1( + new paddle::lite::KernelContext); + auto& ctx = ctx1->As(); + ctx.SetRunMode(static_cast(cls), ths); + /// warmup + for (int j = 0; j < FLAGS_warmup; ++j) { + paddle::lite::arm::math::sgemv( + da, db, dc, tra, m, k, has_bias, dbias, has_relu, &ctx); + } + + t0.Reset(); + for (int i = 0; i < FLAGS_repeats; ++i) { + t0.Start(); + paddle::lite::arm::math::sgemv( + da, db, dc, tra, m, k, has_bias, dbias, has_relu, &ctx); + t0.Stop(); + } + LOG(INFO) << "gemv output: M: " << m << ", K: " << k << ", cluster: " << cls + << ", threads: " << ths << ", GOPS: " << ops * 1e-9f + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() + << " GOPs"; + + if (FLAGS_check_result) { + double max_ratio = 0; + double max_diff = 0; + /// fp32 result + tensor_cmp_host(tc_basic, tc, max_ratio, max_diff); + LOG(INFO) << "compare result, max diff: " << max_diff + << ", max ratio: " << max_ratio; + if (std::abs(max_ratio) > 1e-4f && std::abs(max_diff) > 5e-5f) { + Tensor tdiff; + tdiff.set_precision(PRECISION(kFloat)); + tdiff.Resize(tc.dims()); + tensor_diff(tc_basic, tc, tdiff); + LOG(INFO) << "basic result: "; + print_tensor(tc_basic); + LOG(INFO) << "saber result: "; + print_tensor(tc); + LOG(INFO) << "diff result: "; + print_tensor(tdiff); + return false; + } + } +#endif + return true; +} + +TEST(TestLiteSgemv, Sgemv) { + if (FLAGS_basic_test) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + LOG(INFO) << "run basic sgemv test"; + for (auto& m : {1, 3, 8, 21, 32, 397}) { + for (auto& k : {1, 3, 8, 17, 59, 234}) { + for (auto& tra : {true, false}) { + for (auto& has_bias : {false, true}) { + for (auto& has_relu : {false, true}) { + for (auto& th : {1, 2, 4}) { + auto flag = test_sgemv( + tra, m, k, has_bias, has_relu, FLAGS_cluster, th); + if (flag) { + LOG(INFO) << "test m = " << m << ", k=" << k + << ", bias: " << (has_bias ? "true" : "false") + << ", relu: " << (has_relu ? "true" : "false") + << ", trans A: " << (tra ? "true" : "false") + << ", threads: " << th << " passed\n"; + } else { + LOG(FATAL) << "test m = " << m << ", k=" << k + << ", bias: " << (has_bias ? "true" : "false") + << ", relu: " << (has_relu ? "true" : "false") + << ", trans A: " << (tra ? 
"true" : "false") + << ", threads: " << th << " failed\n"; + } + } + } + } + } + } + } + } +} + +TEST(TestSgemvCustom, Sgemv_custom) { +#ifdef LITE_WITH_ARM + paddle::lite::DeviceInfo::Init(); +#endif + auto flag = test_sgemv(FLAGS_traA, + FLAGS_M, + FLAGS_K, + FLAGS_flag_bias, + FLAGS_flag_relu, + FLAGS_cluster, + FLAGS_threads); + if (!flag) { + LOG(FATAL) << "test m = " << FLAGS_M << ", k=" << FLAGS_K + << ", trans A: " << FLAGS_traA << ", bias: " << FLAGS_flag_bias + << ", relu: " << FLAGS_flag_relu << " failed!!"; + } + LOG(INFO) << "test m = " << FLAGS_M << ", k=" << FLAGS_K + << ", trans A: " << FLAGS_traA << ", bias: " << FLAGS_flag_bias + << ", relu: " << FLAGS_flag_relu << " passed!!"; +} diff --git a/lite/tests/utils/naive_math_impl.h b/lite/tests/utils/naive_math_impl.h index 846126ac24..fd868e85ac 100644 --- a/lite/tests/utils/naive_math_impl.h +++ b/lite/tests/utils/naive_math_impl.h @@ -14,6 +14,108 @@ #pragma once +template +static void basic_trans_mat_to_c4(const type* input, + type* output, + const int ldin, + const int M, + const int K, + bool pack_k) { + const int m_round = (M + 3) / 4 * 4; + int k_round = (K + 3) / 4 * 4; + if (!pack_k) { + k_round = K; + } + const int m_loop = m_round / 4; + type zero_buf[K]; + memset(zero_buf, 0, K * sizeof(type)); + for (int i = 0; i < m_loop; ++i) { + const type* in0 = input + i * 4 * ldin; + const type* in1 = in0 + ldin; + const type* in2 = in1 + ldin; + const type* in3 = in2 + ldin; + if (4 * (i + 1) - M > 0) { + switch (4 * (i + 1) - M) { + case 3: + in1 = zero_buf; + case 2: + in2 = zero_buf; + case 1: + in3 = zero_buf; + default: + break; + } + } + for (int j = 0; j < K; ++j) { + *output++ = *in0++; + *output++ = *in1++; + *output++ = *in2++; + *output++ = *in3++; + } + for (int j = K; j < k_round; ++j) { + *output++ = static_cast(0); + *output++ = static_cast(0); + *output++ = static_cast(0); + *output++ = static_cast(0); + } + } +} + +template +static void basic_gemm_c4(bool trans_a, + bool trans_b, + int m, + int n, + int k, + type2 alpha, + const type* a, + int lda, + const type* b, + int ldb, + type2 beta, + type2* c, + int ldc, + const type2* bias, + bool flag_bias = false, + bool flag_relu = false) { + type2* tmp_c = reinterpret_cast(malloc(m * ldc * sizeof(type2))); + memset(tmp_c, 0, m * ldc * sizeof(type2)); +#pragma omp parallel for + for (int i = 0; i < m; ++i) { + auto bias_data = static_cast(0); + if (flag_bias) { + bias_data = bias[i]; + } + for (int j = 0; j < n; ++j) { + auto sum = static_cast(0); + for (int l = 0; l < k; ++l) { + type av; + type bv; + if (trans_a) { + av = a[l * lda + i]; + } else { + av = a[i * lda + l]; + } + if (trans_b) { + bv = b[j * ldb + l]; + } else { + bv = b[l * ldb + j]; + } + sum += av * bv; + } + type2 tmp = alpha * sum + beta * tmp_c[i * ldc + j] + bias_data; + if (flag_relu) { + tmp_c[i * ldc + j] = tmp > (type2)0 ? tmp : (type2)0; + } else { + tmp_c[i * ldc + j] = tmp; + } + } + } + //! 
trans c to c4 + basic_trans_mat_to_c4(tmp_c, c, ldc, m, n, false); + free(tmp_c); +} + template static void basic_gemm(bool trans_a, bool trans_b, @@ -228,8 +330,10 @@ static void col2im(const Dtype* data_col, const int width, const int kernel_h, const int kernel_w, - const int pad_h, - const int pad_w, + const int pad_h0, + const int pad_h1, + const int pad_w0, + const int pad_w1, const int stride_h, const int stride_w, const int dilation_h, @@ -237,21 +341,24 @@ static void col2im(const Dtype* data_col, Dtype* data_im) { memset(data_im, 0, height * width * channels * sizeof(Dtype)); const int output_h = - (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; + (height + pad_h0 + pad_h1 - (dilation_h * (kernel_h - 1) + 1)) / + stride_h + + 1; const int output_w = - (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; + (width + pad_w0 + pad_w1 - (dilation_w * (kernel_w - 1) + 1)) / stride_w + + 1; const int channel_size = height * width; for (int channel = channels; channel--; data_im += channel_size) { for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) { for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) { - int input_row = -pad_h + kernel_row * dilation_h; + int input_row = -pad_h0 + kernel_row * dilation_h; for (int output_rows = output_h; output_rows; output_rows--) { if (!is_a_ge_zero_and_a_lt_b(input_row, height)) { data_col += output_w; } else { - int input_col = -pad_w + kernel_col * dilation_w; + int input_col = -pad_w0 + kernel_col * dilation_w; for (int output_col = output_w; output_col; output_col--) { if (is_a_ge_zero_and_a_lt_b(input_col, width)) { @@ -289,8 +396,10 @@ void deconv_basic(const Dtype1* din, int stride_h, int dila_w, int dila_h, - int pad_w, - int pad_h, + int pad_w0, + int pad_w1, + int pad_h0, + int pad_h1, bool flag_bias, bool flag_relu) { int m = chout * kernel_w * kernel_h / group; @@ -302,8 +411,9 @@ void deconv_basic(const Dtype1* din, int group_size_coldata = m * n; int group_size_weights = chin * chout * kernel_w * kernel_h / (group * group); bool flag_1x1s1p1 = (kernel_w == 1) && (kernel_h == 1) && (stride_h == 1) && - (stride_w == 1) && (pad_w == 1) && (pad_h == 1) && - (dila_w == 1) && (dila_h == 1); + (stride_w == 1) && (pad_w0 == 0) && (pad_h0 == 0) && + (pad_w1 == 0) && (pad_h1 == 0) && (dila_w == 1) && + (dila_h == 1); Dtype2* workspace_ptr = static_cast(malloc(sizeof(float) * m * n * group)); @@ -316,7 +426,7 @@ void deconv_basic(const Dtype1* din, if (flag_1x1s1p1) { col_data = dout_batch; } - memset(col_data, 0, sizeof(Dtype2) * group_size_coldata); + memset(col_data, 0, sizeof(Dtype2) * group_size_coldata * group); for (int g = 0; g < group; ++g) { const Dtype1* din_group = din_batch + g * group_size_in; const Dtype1* weights_group = weights + g * group_size_weights; @@ -346,8 +456,10 @@ void deconv_basic(const Dtype1* din, wout, kernel_h, kernel_w, - pad_h, - pad_w, + pad_h0, + pad_h1, + pad_w0, + pad_w1, stride_h, stride_w, dila_h, diff --git a/lite/tools/build.sh b/lite/tools/build.sh index 4873e70773..319f26ff82 100755 --- a/lite/tools/build.sh +++ b/lite/tools/build.sh @@ -20,6 +20,7 @@ BUILD_DIR=$(pwd) OPTMODEL_DIR="" BUILD_TAILOR=OFF BUILD_CV=OFF +SHUTDOWN_LOG=ON readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz @@ -93,7 +94,7 @@ function make_tiny_publish_so { -DWITH_TESTING=OFF \ -DLITE_WITH_JAVA=$BUILD_JAVA \ -DLITE_WITH_PYTHON=$BUILD_PYTHON \ - -DLITE_SHUTDOWN_LOG=ON \ + -DLITE_SHUTDOWN_LOG=$SHUTDOWN_LOG \ 
-DLITE_ON_TINY_PUBLISH=ON \ -DANDROID_STL_TYPE=$android_stl \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ @@ -136,7 +137,7 @@ function make_full_publish_so { -DWITH_TESTING=OFF \ -DLITE_WITH_JAVA=$BUILD_JAVA \ -DLITE_WITH_PYTHON=$BUILD_PYTHON \ - -DLITE_SHUTDOWN_LOG=ON \ + -DLITE_SHUTDOWN_LOG=$SHUTDOWN_LOG \ -DANDROID_STL_TYPE=$android_stl \ -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ -DLITE_WITH_CV=$BUILD_CV \ @@ -236,10 +237,10 @@ function make_cuda { -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \ -DWITH_TESTING=OFF \ -DLITE_WITH_ARM=OFF \ - -DLITE_WITH_PYTHON=ON \ + -DLITE_WITH_PYTHON=${BUILD_PYTHON} \ -DLITE_BUILD_EXTRA=ON - - make publish_inference_python_lib -j8 + + make publish_inference -j4 cd - } @@ -290,6 +291,7 @@ function print_usage { echo -e " ./build.sh --arm_os= --arm_abi= --arm_lang= test" echo echo -e "optional argument:" + echo -e "--shutdown_log: (OFF|ON); controls whether to shutdown log, default is ON" echo -e "--build_extra: (OFF|ON); controls whether to publish extra operators and kernels for (sequence-related model such as OCR or NLP)" echo -e "--build_python: (OFF|ON); controls whether to publish python api lib (ANDROID and IOS is not supported)" echo -e "--build_java: (OFF|ON); controls whether to publish java api lib (Only ANDROID is supported)" @@ -366,6 +368,10 @@ function main { BUILD_TAILOR="${i#*=}" shift ;; + --shutdown_log=*) + SHUTDOWN_LOG="${i#*=}" + shift + ;; tiny_publish) make_tiny_publish_so $ARM_OS $ARM_ABI $ARM_LANG $ANDROID_STL shift diff --git a/lite/tools/build_npu.sh b/lite/tools/build_npu.sh index 03a74046f1..1509f563b2 100755 --- a/lite/tools/build_npu.sh +++ b/lite/tools/build_npu.sh @@ -5,8 +5,8 @@ set -ex ARM_OS="android" # android only yet ARM_ABI="armv8" # armv8, armv7 ARM_LANG="gcc" # gcc only yet -ANDROID_STL="c++_static" # c++_shared, c++_static -DDK_ROOT="$(pwd)/ai_ddk_lib/" # HIAI SDK from https://developer.huawei.com/consumer/cn/hiai/ +ANDROID_STL="c++_shared" # c++_shared/c++_static, c++_shared is used by HiAI DDK 310 +DDK_ROOT="$(pwd)/ai_ddk_lib/" # HiAI DDK 310 from https://developer.huawei.com/consumer/cn/hiai/ TARGET_NAME="test_npu_pass" # default target BUILD_EXTRA=OFF # ON(with sequence ops)/OFF WITH_JAVA=ON # ON(build jar and jni so)/OFF diff --git a/lite/tools/ci_build.sh b/lite/tools/ci_build.sh index 8be8e6e6b6..8b5741a7a6 100755 --- a/lite/tools/ci_build.sh +++ b/lite/tools/ci_build.sh @@ -1,9 +1,10 @@ #!/bin/bash +# The git version of CI is 2.7.4. This script is not compatible with git version 1.7.1. set -ex TESTS_FILE="./lite_tests.txt" LIBS_FILE="./lite_libs.txt" - +CUDNN_ROOT="/usr/local/cudnn" readonly ADB_WORK_DIR="/data/local/tmp" readonly common_flags="-DWITH_LITE=ON -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF -DWITH_PYTHON=OFF -DWITH_TESTING=ON -DLITE_WITH_ARM=OFF" @@ -162,6 +163,12 @@ function cmake_x86_for_CI { # make test_generated_code -j$NUM_CORES_FOR_COMPILE } +function cmake_cuda_for_CI { + prepare_workspace # fake an empty __generated_code__.cc to pass cmake. + cmake .. -DLITE_WITH_CUDA=ON -DWITH_MKLDNN=OFF -DLITE_WITH_X86=OFF ${common_flags} -DLITE_WITH_PROFILE=ON -DWITH_MKL=OFF \ + -DLITE_BUILD_EXTRA=ON -DCUDNN_ROOT=${CUDNN_ROOT} +} + function cmake_gpu { prepare_workspace cmake .. " -DWITH_GPU=ON {common_flags} -DLITE_WITH_GPU=ON" @@ -195,7 +202,6 @@ function test_server { # Due to the missing of x86 kernels, we skip the following tests temporarily. 
# TODO(xxx) clear the skip list latter local skip_list=("test_paddle_api" "test_cxx_api" - "test_mobilenetv1_lite_x86" "test_mobilenetv2_lite_x86" "test_light_api" "test_apis" "test_model_bin" ) @@ -227,6 +233,16 @@ function build_test_server { test_model_optimize_tool_compile } +# The CUDA version of CI is cuda_10.1.243_418.87.00_linux. +# The cuDNN version is cudnn-10.1-linux-x64-v7.5.0.56. +function build_test_cuda_server { + mkdir -p ./build + cd ./build + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib" + cmake_cuda_for_CI + build +} + function build_test_train { mkdir -p ./build cd ./build @@ -951,6 +967,10 @@ function main { test_arm_android $TEST_NAME $ARM_PORT shift ;; + build_test_cuda_server) + build_test_cuda_server + shift + ;; build_test_server) build_test_server shift diff --git a/lite/tools/debug/debug_utils.h b/lite/tools/debug/debug_utils.h index 7f77b90488..ff08c47e52 100644 --- a/lite/tools/debug/debug_utils.h +++ b/lite/tools/debug/debug_utils.h @@ -27,7 +27,7 @@ #include "lite/model_parser/pb/var_desc.h" #include "lite/utils/all.h" -DEFINE_string(model_dir, "", "Model dir path"); +DEFINE_string(model_path, "", "Model dir path"); DEFINE_string(input_file, "", "Input datas file path"); DEFINE_string(topo_output_file, "", "Runtime topology order output file path"); DEFINE_bool(output_topo, true, "Dump runtime topology or not"); @@ -185,7 +185,7 @@ void ParseConfig(DebugConfig* conf) { CHECK(conf); #define CHECK_NON_EMPTY(name__) \ CHECK(!FLAGS_##name__.empty()) << "Option " << #name__ << " can't be empty." - CHECK_NON_EMPTY(model_dir); + CHECK_NON_EMPTY(model_path); if (FLAGS_output_topo) { CHECK_NON_EMPTY(topo_output_file); } @@ -193,7 +193,7 @@ void ParseConfig(DebugConfig* conf) { CHECK_NON_EMPTY(tensor_output_file); } #undef CHECK_NON_EMPTY - conf->model_dir = FLAGS_model_dir; + conf->model_dir = FLAGS_model_path; conf->topo_output_file = FLAGS_topo_output_file; conf->tensor_output_file = FLAGS_tensor_output_file; conf->input_file = FLAGS_input_file; diff --git a/lite/utils/cv/paddle_image_preprocess.cc b/lite/utils/cv/paddle_image_preprocess.cc index 0bccfe2804..f180475568 100644 --- a/lite/utils/cv/paddle_image_preprocess.cc +++ b/lite/utils/cv/paddle_image_preprocess.cc @@ -69,240 +69,6 @@ void ImagePreprocess::imageResize(const uint8_t* src, int dstw, int dsth) { resize(src, dst, srcFormat, srcw, srch, dstw, dsth); - /* - int size = srcw * srch; - if (srcw == dstw && srch == dsth) { - if (srcFormat == NV12 || srcFormat == NV21) { - size = srcw * (floor(1.5 * srch)); - } else if (srcFormat == BGR || srcFormat == RGB) { - size = 3 * srcw * srch; - } else if (srcFormat == BGRA || srcFormat == RGBA) { - size = 4 * srcw * srch; - } - memcpy(dst, src, sizeof(uint8_t) * size); - return; - } - double scale_x = static_cast(srcw / dstw); - double scale_y = static_cast(srch / dsth); - - int* buf = new int[dstw * 2 + dsth * 2]; - - int* xofs = buf; - int* yofs = buf + dstw; - int16_t* ialpha = reinterpret_cast(buf + dstw + dsth); - int16_t* ibeta = reinterpret_cast(buf + 2 * dstw + dsth); - - compute_xy( - srcw, srch, dstw, dsth, scale_x, scale_y, xofs, yofs, ialpha, ibeta); - - int w_out = dstw; - int w_in = srcw; - int num = 1; - int orih = dsth; - if (srcFormat == GRAY) { - num = 1; - } else if (srcFormat == NV12 || srcFormat == NV21) { - num = 1; - int hout = static_cast(0.5 * dsth); - dsth += hout; - } else if (srcFormat == BGR || srcFormat == RGB) { - w_in = srcw * 3; - w_out = dstw * 3; - num = 3; - - } else if (srcFormat == BGRA || 
srcFormat == RGBA) { - w_in = srcw * 4; - w_out = dstw * 4; - num = 4; - } - - int* xofs1 = nullptr; - int* yofs1 = nullptr; - int16_t* ialpha1 = nullptr; - if (orih < dsth) { // uv - int tmp = dsth - orih; - int w = dstw / 2; - xofs1 = new int[w]; - yofs1 = new int[tmp]; - ialpha1 = new int16_t[srcw]; - compute_xy(srcw / 2, - srch / 2, - w, - tmp, - scale_x, - scale_y, - xofs1, - yofs1, - ialpha1, - ibeta + orih); - } - int cnt = w_out >> 3; - int remain = w_out % 8; - int32x4_t _v2 = vdupq_n_s32(2); - #pragma omp parallel for - for (int dy = 0; dy < dsth; dy++) { - int16_t* rowsbuf0 = new int16_t[w_out]; - int16_t* rowsbuf1 = new int16_t[w_out]; - int sy = yofs[dy]; - if (dy >= orih) { - xofs = xofs1; - yofs = yofs1; - ialpha = ialpha1; - } - if (sy < 0) { - memset(rowsbuf0, 0, sizeof(uint16_t) * w_out); - const uint8_t* S1 = src + srcw * (sy + 1); - const int16_t* ialphap = ialpha; - int16_t* rows1p = rowsbuf1; - for (int dx = 0; dx < dstw; dx++) { - int sx = xofs[dx] * num; // num = 4 - int16_t a0 = ialphap[0]; - int16_t a1 = ialphap[1]; - - const uint8_t* S1pl = S1 + sx; - const uint8_t* S1pr = S1 + sx + num; - if (sx < 0) { - S1pl = S1; - } - for (int i = 0; i < num; i++) { - if (sx < 0) { - *rows1p++ = ((*S1pl++) * a1) >> 4; - } else { - *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4; - } - } - ialphap += 2; - } - } else { - // hresize two rows - const uint8_t* S0 = src + w_in * (sy); - const uint8_t* S1 = src + w_in * (sy + 1); - const int16_t* ialphap = ialpha; - int16_t* rows0p = rowsbuf0; - int16_t* rows1p = rowsbuf1; - for (int dx = 0; dx < dstw; dx++) { - int sx = xofs[dx] * num; // num = 4 - int16_t a0 = ialphap[0]; - int16_t a1 = ialphap[1]; - - const uint8_t* S0pl = S0 + sx; - const uint8_t* S0pr = S0 + sx + num; - const uint8_t* S1pl = S1 + sx; - const uint8_t* S1pr = S1 + sx + num; - if (sx < 0) { - S0pl = S0; - S1pl = S1; - } - for (int i = 0; i < num; i++) { - if (sx < 0) { - *rows0p = ((*S0pl++) * a1) >> 4; - *rows1p = ((*S1pl++) * a1) >> 4; - rows0p++; - rows1p++; - } else { - *rows0p++ = ((*S0pl++) * a0 + (*S0pr++) * a1) >> 4; - *rows1p++ = ((*S1pl++) * a0 + (*S1pr++) * a1) >> 4; - } - } - ialphap += 2; - } - } - int ind = dy * 2; - int16_t b0 = ibeta[ind]; - int16_t b1 = ibeta[ind + 1]; - int16x8_t _b0 = vdupq_n_s16(b0); - int16x8_t _b1 = vdupq_n_s16(b1); - uint8_t* dp_ptr = dst + dy * w_out; - int16_t* rows0p = rowsbuf0; - int16_t* rows1p = rowsbuf1; - int re_cnt = cnt; - if (re_cnt > 0) { - #ifdef __aarch64__ - asm volatile( - "1: \n" - "ld1 {v0.8h}, [%[rows0p]], #16 \n" - "ld1 {v1.8h}, [%[rows1p]], #16 \n" - "orr v6.16b, %w[_v2].16b, %w[_v2].16b \n" - "orr v7.16b, %w[_v2].16b, %w[_v2].16b \n" - "smull v2.4s, v0.4h, %w[_b0].4h \n" - "smull2 v4.4s, v0.8h, %w[_b0].8h \n" - "smull v3.4s, v1.4h, %w[_b1].4h \n" - "smull2 v5.4s, v1.8h, %w[_b1].8h \n" - - "ssra v6.4s, v2.4s, #16 \n" - "ssra v7.4s, v4.4s, #16 \n" - "ssra v6.4s, v3.4s, #16 \n" - "ssra v7.4s, v5.4s, #16 \n" - - "shrn v0.4h, v6.4s, #2 \n" - "shrn2 v0.8h, v7.4s, #2 \n" - "subs %w[cnt], %w[cnt], #1 \n" - "sqxtun v1.8b, v0.8h \n" - "st1 {v1.8b}, [%[dp]], #8 \n" - "bne 1b \n" - : [rows0p] "+r"(rows0p), - [rows1p] "+r"(rows1p), - [cnt] "+r"(re_cnt), - [dp] "+r"(dp_ptr) - : [_b0] "w"(_b0), [_b1] "w"(_b1), [_v2] "w"(_v2) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); - #else - asm volatile( - "mov r4, #2 \n" - "vdup.s32 q12, r4 \n" - "0: \n" - "vld1.s16 {d2-d3}, [%[rows0p]]!\n" - "vld1.s16 {d6-d7}, [%[rows1p]]!\n" - "vorr.s32 q10, q12, q12 \n" - "vorr.s32 q11, q12, q12 \n" - - "vmull.s16 
q0, d2, %[_b0] \n" - "vmull.s16 q1, d3, %[_b0] \n" - "vmull.s16 q2, d6, %[_b1] \n" - "vmull.s16 q3, d7, %[_b1] \n" - - "vsra.s32 q10, q0, #16 \n" - "vsra.s32 q11, q1, #16 \n" - "vsra.s32 q10, q2, #16 \n" - "vsra.s32 q11, q3, #16 \n" - - "vshrn.s32 d20, q10, #2 \n" - "vshrn.s32 d21, q11, #2 \n" - "subs %[cnt], #1 \n" - "vqmovun.s16 d20, q10 \n" - "vst1.8 {d20}, [%[dp]]! \n" - "bne 0b \n" - : [rows0p] "+r"(rows0p), - [rows1p] "+r"(rows1p), - [cnt] "+r"(re_cnt), - [dp] "+r"(dp_ptr) - : [_b0] "w"(_b0), [_b1] "w"(_b1) - : "cc", - "memory", - "r4", - "q0", - "q1", - "q2", - "q3", - "q8", - "q9", - "q10", - "q11", - "q12"); - - #endif // __aarch64__ - } - for (int i = 0; i < remain; i++) { - // D[x] = (rows0[x]*b0 + rows1[x]*b1) >> - // INTER_RESIZE_COEF_BITS; - *dp_ptr++ = - (uint8_t)(((int16_t)((b0 * (int16_t)(*rows0p++)) >> 16) + - (int16_t)((b1 * (int16_t)(*rows1p++)) >> 16) + 2) >> - 2); - } - } - delete[] buf; - */ } void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst) { diff --git a/lite/utils/cv/paddle_image_preprocess.h b/lite/utils/cv/paddle_image_preprocess.h index 11673e1904..5a46a9e48e 100644 --- a/lite/utils/cv/paddle_image_preprocess.h +++ b/lite/utils/cv/paddle_image_preprocess.h @@ -133,7 +133,7 @@ class ImagePreprocess { * color format support 1-channel image, 3-channel image and 4-channel image * param src: input image data * param dst: output image data - * param srcFormat: input image format, support GRAY, BGR(GRB) and BGRA(RGBA) + * param srcFormat: input image format, support GRAY, BGR(RGB) and BGRA(RGBA) * param srcw: input image width * param srch: input image height * param degree: Rotate degree, support 90, 180 and 270 @@ -158,7 +158,7 @@ class ImagePreprocess { * color format support 1-channel image, 3-channel image and 4-channel image * param src: input image data * param dst: output image data - * param srcFormat: input image format, support GRAY, BGR(GRB) and BGRA(RGBA) + * param srcFormat: input image format, support GRAY, BGR(RGB) and BGRA(RGBA) * param srcw: input image width * param srch: input image height * param flip_param: flip parameter, support X, Y and XY @@ -190,7 +190,7 @@ class ImagePreprocess { * NCHW * param src: input image data * param dstTensor: output tensor data - * param srcFormat: input image format, support BGR(GRB) and BGRA(RGBA) + * param srcFormat: input image format, support BGR(RGB) and BGRA(RGBA) * param srcw: input image width * param srch: input image height * param layout: output tensor layout,support NHWC and NCHW diff --git a/lite/utils/io.h b/lite/utils/io.h index 98a0f39b08..92405cae86 100644 --- a/lite/utils/io.h +++ b/lite/utils/io.h @@ -14,9 +14,12 @@ #pragma once +#include #include +#include #include #include +#include #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" @@ -46,11 +49,68 @@ static void MkDirRecur(const std::string& path) { // read buffer from file static std::string ReadFile(const std::string& filename) { std::ifstream ifile(filename.c_str()); + if (!ifile.is_open()) { + LOG(FATAL) << "Open file: [" << filename << "] failed."; + } std::ostringstream buf; char ch; while (buf && ifile.get(ch)) buf.put(ch); + ifile.close(); return buf.str(); } +// read lines from file +static std::vector ReadLines(const std::string& filename) { + std::ifstream ifile(filename.c_str()); + if (!ifile.is_open()) { + LOG(FATAL) << "Open file: [" << filename << "] failed."; + } + std::vector res; + std::string tmp; + while (getline(ifile, tmp)) res.push_back(tmp); + ifile.close(); + return res; +} + +static 
void WriteLines(const std::vector& lines, + const std::string& filename) { + std::ofstream ofile(filename.c_str()); + if (!ofile.is_open()) { + LOG(FATAL) << "Open file: [" << filename << "] failed."; + } + for (const auto& line : lines) { + ofile << line << "\n"; + } + ofile.close(); +} + +static bool IsDir(const std::string& path) { + DIR* dir_fd = opendir(path.c_str()); + if (dir_fd == nullptr) return false; + closedir(dir_fd); + return true; +} + +static std::vector ListDir(const std::string& path, + bool only_dir = false) { + if (!IsDir(path)) { + LOG(FATAL) << "[" << path << "] is not a valid dir path."; + } + + std::vector paths; + DIR* parent_dir_fd = opendir(path.c_str()); + dirent* dp; + while ((dp = readdir(parent_dir_fd)) != nullptr) { + // Exclude '.', '..' and hidden dir + std::string name(dp->d_name); + if (name == "." || name == ".." || name[0] == '.') continue; + if (IsDir(Join({path, name}, "/"))) { + paths.push_back(name); + } + } + closedir(parent_dir_fd); + return paths; +} + } // namespace lite } // namespace paddle diff --git a/lite/utils/logging.cc b/lite/utils/logging.cc index 6351be95ac..e9ee5861ba 100644 --- a/lite/utils/logging.cc +++ b/lite/utils/logging.cc @@ -43,10 +43,10 @@ void gen_log(STL::ostream& log_stream_, gettimeofday(&tv, NULL); // print date / time - log_stream_ << '[' << level << ' ' << std::setw(2) << 1 + tm_time.tm_mon - << '/' << std::setw(2) << tm_time.tm_mday << ' ' << std::setw(2) - << tm_time.tm_hour << ':' << std::setw(2) << tm_time.tm_min << ':' - << std::setw(2) << tm_time.tm_sec << '.' << std::setw(3) + log_stream_ << '[' << level << ' ' << STL::setw(2) << 1 + tm_time.tm_mon + << '/' << STL::setw(2) << tm_time.tm_mday << ' ' << STL::setw(2) + << tm_time.tm_hour << ':' << STL::setw(2) << tm_time.tm_min << ':' + << STL::setw(2) << tm_time.tm_sec << '.' 
<< STL::setw(3) << tv.tv_usec / 1000 << " "; if (len > kMaxLen) { diff --git a/lite/utils/logging.h b/lite/utils/logging.h index e85753ec30..c2c999fd70 100644 --- a/lite/utils/logging.h +++ b/lite/utils/logging.h @@ -30,6 +30,18 @@ #include #include "lite/utils/replace_stl/stream.h" +#ifdef LITE_WITH_ANDROID +#include +// Android log macors +#define ANDROID_LOG_TAG "Paddle-Lite" +#define ANDROID_LOG_I(msg) \ + __android_log_print(ANDROID_LOG_INFO, ANDROID_LOG_TAG, msg) +#define ANDROID_LOG_W(msg) \ + __android_log_print(ANDROID_LOG_WARN, ANDROID_LOG_TAG, msg) +#define ANDROID_LOG_F(msg) \ + __android_log_print(ANDROID_LOG_FATAL, ANDROID_LOG_TAG, msg) +#endif + // NOLINTFILE() // LOG() @@ -93,11 +105,22 @@ class LogMessage { const char* func, int lineno, const char* level = "I") { + level_ = level; paddle::lite::gen_log(log_stream_, file, func, lineno, level); } ~LogMessage() { log_stream_ << '\n'; +#ifdef LITE_WITH_ANDROID + if (level_ == "I") { + ANDROID_LOG_I(log_stream_.str().c_str()); + } else if (level_ == "W") { + ANDROID_LOG_W(log_stream_.str().c_str()); + } else { + fprintf(stderr, "Unsupported log level: %s", level_.c_str()); + assert(false); + } +#endif fprintf(stderr, "%s", log_stream_.str().c_str()); } @@ -105,6 +128,7 @@ class LogMessage { protected: STL::stringstream log_stream_; + std::string level_; LogMessage(const LogMessage&) = delete; void operator=(const LogMessage&) = delete; @@ -121,7 +145,11 @@ class LogMessageFatal : public LogMessage { ~LogMessageFatal() { log_stream_ << '\n'; +#ifdef LITE_WITH_ANDROID + ANDROID_LOG_F(log_stream_.str().c_str()); +#endif fprintf(stderr, "%s", log_stream_.str().c_str()); + #ifndef LITE_ON_TINY_PUBLISH abort(); #else @@ -152,6 +180,9 @@ class VLogMessage { return; } log_stream_ << '\n'; +#ifdef LITE_WITH_ANDROID + ANDROID_LOG_I(log_stream_.str().c_str()); +#endif fprintf(stderr, "%s", log_stream_.str().c_str()); } diff --git a/lite/utils/replace_stl/stream.cc b/lite/utils/replace_stl/stream.cc index 61999a79e3..d821078e36 100644 --- a/lite/utils/replace_stl/stream.cc +++ b/lite/utils/replace_stl/stream.cc @@ -13,6 +13,8 @@ // limitations under the License. 
#include "lite/utils/replace_stl/stream.h" +#include +#include #ifdef LITE_ON_TINY_PUBLISH @@ -20,93 +22,119 @@ namespace paddle { namespace lite { namespace replace_stl { +void ostream::pad(const std::string& text) { + if (display_width_ > 0) { + if (display_width_ < text.size()) { + fprintf(stderr, "Replace STL IO display length less than text\n"); + assert(false); + } else { + for (int i = 0; i < display_width_ - text.size(); ++i) { + data_.push_back(' '); + } + display_width_ = -1; + } + } +} + #ifdef LITE_SHUTDOWN_LOG #define ADD_DATA_AS_STRING(data_, obj_) #else -#define ADD_DATA_AS_STRING(data_, obj_) data_ = data_ + std::to_string(obj_) +#define ADD_DATA_AS_STRING(data_, obj_) \ + std::string text = std::to_string(obj_); \ + pad(text); \ + data_ = data_ + text; + #endif template <> ostream& ostream::operator<<(const char* obj) { - _data = _data + std::string(obj); + data_ = data_ + std::string(obj); return *this; } template <> ostream& ostream::operator<<(const char& obj) { - _data = _data + obj; + data_ = data_ + obj; return *this; } template <> ostream& ostream::operator<<(const std::string& obj) { - _data = _data + obj; + data_ = data_ + obj; return *this; } template <> ostream& ostream::operator<<(const int16_t& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const int& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const bool& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const long& obj) { // NOLINT - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const long long& obj) { // NOLINT - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const unsigned& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const unsigned long& obj) { // NOLINT - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const unsigned long long& obj) { // NOLINT - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const float& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const double& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); return *this; } template <> ostream& ostream::operator<<(const long double& obj) { - ADD_DATA_AS_STRING(_data, obj); + ADD_DATA_AS_STRING(data_, obj); + return *this; +} + +template <> +ostream& ostream::operator<<(const LiteIoWidth& obj) { + int width = obj.width; + assert(width > 0); + display_width_ = width; return *this; } diff --git a/lite/utils/replace_stl/stream.h b/lite/utils/replace_stl/stream.h index e6bb261706..3288a19869 100644 --- a/lite/utils/replace_stl/stream.h +++ b/lite/utils/replace_stl/stream.h @@ -29,18 +29,25 @@ namespace lite { namespace replace_stl { +struct LiteIoWidth { + explicit LiteIoWidth(int value) : width(value) {} + int width; +}; + +static LiteIoWidth setw(int width) { return LiteIoWidth(width); } + class ostream { public: ostream() {} - explicit ostream(const std::string& x) : _data(x) {} + explicit ostream(const 
std::string& x) : data_(x) {} ~ostream() {} - const char* c_str() { return _data.c_str(); } + const char* c_str() { return data_.c_str(); } - const std::string& str() { return _data; } + const std::string& str() { return data_; } const std::string& str(const std::string& x) { - _data = x; - return _data; + data_ = x; + return data_; } template @@ -50,7 +57,9 @@ class ostream { ostream& operator<<(const T* obj); private: - std::string _data; + void pad(const std::string& text); + std::string data_; + int display_width_{-1}; // -1 refers to no setting }; class stringstream : public ostream { diff --git a/mobile/src/fpga/V2/api.cpp b/mobile/src/fpga/V2/api.cpp index f39d012e08..1a90cb5bdc 100644 --- a/mobile/src/fpga/V2/api.cpp +++ b/mobile/src/fpga/V2/api.cpp @@ -623,7 +623,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->concat_arg.images_in[i] = (int8_t *)arg->conv_arg[i].output.address; // NOLINT - arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address; + arg->concat_arg.scales_in[i] = out->scale; arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; expand_conv_arg(&arg->conv_arg[i]); diff --git a/mobile/src/fpga/V2/image.cpp b/mobile/src/fpga/V2/image.cpp old mode 100644 new mode 100755 index dc3c3356e8..917491c371 --- a/mobile/src/fpga/V2/image.cpp +++ b/mobile/src/fpga/V2/image.cpp @@ -83,11 +83,6 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out, height * align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) * sizeof(int8_t)); - for (j = 0; - j < height * align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT); - j++) { - images_in_tmp[i][j] = (int8_t)(images_in[i][j] * Ck + 0.5); - } } align_each_out_area_cw = align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT); @@ -102,7 +97,7 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out, memcpy( (int8_t *)image_out + tmp_channel + // NOLINT k * align_each_out_area_cw_differ, - images_in_tmp[i] + j * channel_num[i] + k * align_each_in_area_cw, + images_in[i] + j * channel_num[i] + k * align_each_in_area_cw, channel_num[i] * sizeof(int8_t)); tmp_channel += channel_num[i]; @@ -110,6 +105,10 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out, } } fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int8_t)); + for (i = 0; i < image_num; i++) { + fpga_free(images_in_tmp[i]); + } + fpga_free(images_in_tmp); } void split_image(int8_t *image_in, void **images_out, int image_num, diff --git a/mobile/src/fpga/V2/pe.cpp b/mobile/src/fpga/V2/pe.cpp old mode 100644 new mode 100755 index aa150e0c6c..a3c179994a --- a/mobile/src/fpga/V2/pe.cpp +++ b/mobile/src/fpga/V2/pe.cpp @@ -109,7 +109,7 @@ using namespace std; // NOLINT #define REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT 0x868 #define REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT 0x870 #define REG_POOLING_RESULT_AMOUNT_ALIGN_32 0x878 -#define REG_POOLING_RESULT_AMOUNT_ALIGN_64 0x880 +#define REG_POOLING_RESULT_AMOUNT_ALIGN_16 0x880 #define REG_POOLING_IMAGE_CALCU_HEIGHT 0x888 #define REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW 0x898 #define REG_POOLING_MODE_RECIPROCAL 0x890 @@ -248,8 +248,8 @@ int ComputeBasicConv(const struct ConvArgs &args) { // DLOG << " activation_type:" << active_args.activation_type // << " leaky_relu_negative_slope:" // << active_args.leaky_relu_negative_slope; - // DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; - + DLOG << " reg_ActivationArgs:"; + uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); 
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) { ret = -EIO; @@ -257,6 +257,10 @@ int ComputeBasicConv(const struct ConvArgs &args) { pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; } + // reg_writeq(reg_ActivationArgs, + // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion + + reg_writeq(output_scale, REG_SCALE_PARAMETER); // new reg_writeq((args.driver.row_padding_down << 45) | (args.driver.row_padding_up << 34) | @@ -270,10 +274,10 @@ int ComputeBasicConv(const struct ConvArgs &args) { args.driver.filter_pad_width_mul_channel, REG_CONV_REG1); - reg_writeq((args.driver.stride_h << 48) | (args.driver.skip_window << 28) | - (args.driver.filter_row << 8) | - (args.driver.filter_height << 4) | args.driver.filter_width, - REG_CONV_REG2); + reg_writeq((args.driver.stride_h << 50) | (args.driver.skip_window << 30) | + (args.driver.filter_row << 10) | + (args.driver.filter_height << 5) | args.driver.filter_width, + REG_CONV_REG2); reg_writeq((args.driver.filter_num << 42) | (args.driver.filter_align << 26) | (args.driver.prog_full_cnt << 16) | @@ -358,7 +362,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { << " out_scale_address:" << args.output.scale_address; #endif #ifdef PADDLE_MOBILE_ZU5 - DLOG << "Polling"; // return 0; uint64_t output_scale = 0; uint64_t timer_cnt = 0; @@ -366,66 +369,74 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { uint64_t cmd = 0; uint64_t image_physical_address = 0; uint64_t output_physical_address = 0; - - // uint64_t reg_ActivationArgs = 0; - // active function:{none,leakeyrelu,sigmoid,tanh} - // ActivationArgs active_args; - // active_args.activation_type = LEAKYRELU; - // active_args.activation_type = args.output.activation.activation_type; - - // active_args.leaky_relu_negative_slope = - // args.output.activation.leaky_relu_negative_slope; - - // reg_ActivationArgs = (uint64_t(active_args.activation_type) << 32) | - // active_args.leaky_relu_negative_slope; - - // DLOG << " activation_type:" << active_args.activation_type - // << " leaky_relu_negative_slope:" - // << active_args.leaky_relu_negative_slope; - // DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; - - image_physical_address = vaddr_to_paddr_driver(args.image.address); - output_physical_address = vaddr_to_paddr_driver(args.output.address); - uint32_t output_height = (uint32_t)( +uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); + image_physical_address = vaddr_to_paddr(args.image.address); + output_physical_address = vaddr_to_paddr(args.output.address); + uint64_t C_paral_64 = align_to_x((uint64_t)args.image.channels, 64); + uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32); + uint64_t output_height = (uint64_t)( (args.image.height + args.image.pad_height * 2 - args.kernel.height) / - args.kernel.stride_h + - 1); - uint32_t output_width = (uint32_t)( + args.kernel.stride_h + 1); + uint64_t output_width = (uint64_t)( (args.image.width + args.image.pad_width * 2 - args.kernel.width) / - args.kernel.stride_w + - 1); + args.kernel.stride_w + 1); + uint64_t image_amount_per_row = align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t image_one_pad_per_row = - align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, - FILTER_ELEMENT_ALIGNMENT) + - (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; - uint64_t image_two_pad_per_row = align_to_x( - ((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) * - (uint64_t)args.image.channels, - 
IMAGE_ALIGNMENT); - uint64_t image_row_mul_pooling_hight = - image_amount_per_row * (uint64_t)args.kernel.height; - uint64_t image_row_mul_pad_hight = - image_amount_per_row * (uint64_t)args.image.pad_height; - uint64_t image_row_mul_step_hight = - image_amount_per_row * (uint64_t)args.kernel.stride_h; - uint64_t result_amount_align_32 = - align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, - FILTER_ELEMENT_ALIGNMENT); - uint64_t result_amount_align_64 = align_to_x( - (uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t image_calcu_height = - (uint64_t)args.kernel.height + - ((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h; - uint64_t image_pad_left = args.image.channels * args.image.pad_width; - uint64_t image_skip_window = args.image.channels * args.kernel.stride_w; - uint64_t image_padleft_skipwindow = - (image_skip_window << 32) | image_pad_left; - uint64_t mode_reciprocal = (uint64_t)0 | ((uint64_t)args.mode) << 16 | - (((uint64_t)args.kernel_reciprocal)); - + uint64_t image_one_pad_per_row = (uint64_t)args.image.width * + (uint64_t)args.image.channels +(uint64_t)args.image.pad_width * + (uint64_t)args.image.channels; + + uint64_t result_amount_align_32 = align_to_x((uint64_t)output_width * + (uint64_t)args.image.channels, 32); + uint64_t result_addr_row = + (result_amount_align_32 << 32) | output_physical_address; + uint64_t row_padding_down = + (uint64_t)args.image.height + (uint64_t)args.image.pad_height; + uint64_t kernel_width_sub1 = + (uint64_t)args.kernel.width - 1; + uint64_t kernel_padding_step = row_padding_down | + ((uint64_t)args.image.pad_height << 16) | + ((uint64_t)args.kernel.stride_h << 24) | + ((uint64_t)kernel_width_sub1<<32) | + ((uint64_t)args.kernel.height << 40) | + ((uint64_t)(args.kernel.height-1) << 48); + uint64_t image_calcu_height = (uint64_t)args.kernel.height + + (output_height - 1) * (uint64_t)args.kernel.stride_h; + uint64_t result_size_calcu_height = (output_height - 1) | + ((output_width - 1) << 16) | (image_calcu_height << 32); + uint64_t col_padding_down = ((uint64_t)args.image.width + + (uint64_t)args.image.pad_width) * (uint64_t)args.image.channels; + + uint64_t image_row_col_padding_down = + image_amount_per_row | (col_padding_down << 32); + uint64_t image_rowXpadding_h = + image_amount_per_row * (uint64_t)args.image.pad_height; + uint64_t image_rowXstep_h = + image_amount_per_row * (uint64_t)args.kernel.stride_h; + uint64_t image_rowXpad_h_rowXstep_h = + image_rowXpadding_h | (image_rowXstep_h << 32); + uint64_t channelXpad_w = + (uint64_t)args.image.channels * (uint64_t)args.image.pad_width; + uint64_t channelXstep_w = + (uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w; + uint64_t channelXpad_w_channelXstep_w = + channelXpad_w | (channelXstep_w << 32); + uint64_t filter_row_align = + C_align_32 * (uint64_t)args.kernel.width; + uint64_t sub_filter_amount_align = C_align_32 * + (uint64_t)args.kernel.width * (uint64_t)args.kernel.height; + uint64_t mult_factor = 0; + float average_reciprocal = args.kernel_reciprocal; + uint32_t* kernel_reciprocal; + kernel_reciprocal =(reinterpret_cast(&average_reciprocal)); + if (args.mode == 1) + mult_factor = (uint64_t)(*kernel_reciprocal) | + ((uint64_t)1 << 32) | ((uint64_t)1 << 40); + else + mult_factor = + (uint64_t)0x3f800000 | ((uint64_t)1 << 32) | ((uint64_t)1 << 40); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { ret = -EIO; @@ -433,41 +444,21 @@ int 
ComputeFpgaPool(const struct PoolingArgs &args) { pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; } - - // reg_writeq(reg_ActivationArgs, - // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion - - // reg_writeq(output_scale, REG_SCALE_PARAMETER); - reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); - reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); - reg_writeq( - ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), - REG_POOLING_IMAGE_PIXEL); - reg_writeq( - ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), - REG_POOLING_WINDOW_SIZE); - reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32), - REG_POOLING_RESULT_PIXEL); - reg_writeq(((uint64_t)args.image.pad_height) | - (((uint64_t)args.image.pad_width) << 32), - REG_POOLING_PAD_PIXEL); - reg_writeq(((uint64_t)args.kernel.stride_h) | - (((uint64_t)args.kernel.stride_w) << 32), - REG_POOLING_STEP_PIXEL); - reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER); - reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW); - reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW); - reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW); - reg_writeq(image_row_mul_pooling_hight, - REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT); - reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT); - reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT); - reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32); - reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64); - reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT); - reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW); - reg_writeq(mode_reciprocal, REG_POOLING_MODE_RECIPROCAL); - reg_writeq(cmd, REG_POOLING_CMD); + reg_writeq(output_scale, REG_SCALE_PARAMETER); + reg_writeq(image_physical_address, 0x808); + reg_writeq(result_addr_row, 0x810); + reg_writeq(kernel_padding_step, 0x818); + reg_writeq(result_size_calcu_height, 0x820); + reg_writeq((uint64_t)args.image.channels, 0x828); + reg_writeq(image_row_col_padding_down, 0x830); + reg_writeq(image_rowXpad_h_rowXstep_h, 0x838); + reg_writeq(mult_factor, 0x840); // dw donot care + reg_writeq(channelXpad_w_channelXstep_w, 0x848); + if (args.mode == 1) + cmd = (uint64_t)4; + else + cmd = (uint64_t)8; + reg_writeq(cmd, 0x800); DLOG << "before reg poll"; if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { @@ -478,14 +469,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { } DLOG << "after reg poll"; - // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); - // output_scale = reg_readq(REG_SCALE_PARAMETER); - // output_scale = (output_scale << 32) | (output_scale >> 32); - // fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - - // active_args.activation_type = NONE; - // reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; @@ -518,19 +501,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { #endif #ifdef PADDLE_MOBILE_ZU5 int ret = 0; - uint64_t output_scale = 0; - - // uint64_t reg_ActivationArgs = 0; - // ActivationArgs active_args; - // active_args.activation_type = args.output.activation.activation_type; - // active_args.leaky_relu_negative_slope = - // args.output.activation.leaky_relu_negative_slope; - // reg_ActivationArgs = 
(uint64_t(active_args.activation_type) << 32) | - // active_args.leaky_relu_negative_slope; - // DLOG << " activation_type:" << active_args.activation_type - // << " leaky_relu_negative_slope:" - // << active_args.leaky_relu_negative_slope; - // DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; +uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) { @@ -540,18 +511,47 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { return ret; } - // reg_writeq(reg_ActivationArgs, - // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion - reg_writeq(output_scale, REG_SCALE_PARAMETER); - reg_writeq(args.driver.image0_address_phy, REG_EW_IMAGE0_BASE_ADDR); - reg_writeq(args.driver.image1_address_phy, REG_EW_IMAGE1_BASE_ADDR); - reg_writeq(args.driver.datalen, REG_EW_DATA_LEN); - reg_writeq(args.driver.image_image_pixel, REG_EW_IMAGE_PIXEL); - reg_writeq(args.driver.image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW); - reg_writeq(args.driver.output_address_phy, REG_EW_RESULT_BASE_ADDR); - reg_writeq(args.driver.coefficient, REG_EW_COEFFICIENT); - reg_writeq(args.driver.cmd, REG_EW_CMD); + uint64_t image0_physical_address = 0; + uint64_t image1_physical_address = 0; + uint64_t image_physical_address = 0; + uint64_t output_physical_address = 0; + image0_physical_address = vaddr_to_paddr(args.image0.address); + image1_physical_address = vaddr_to_paddr(args.image1.address); + image_physical_address = + image0_physical_address | (image1_physical_address << 32); + output_physical_address = vaddr_to_paddr(args.output.address); + uint64_t image_amount_per_row = + align_to_x((uint64_t)args.image0.width * + (uint64_t)args.image0.channels, IMAGE_ALIGNMENT); + uint64_t result_addr_row = + output_physical_address | (image_amount_per_row << 32); + uint64_t kernel_padding_step = 0; + kernel_padding_step = ((uint64_t)args.image0.height * 2) | + ((uint64_t)2 << 24) | ((uint64_t)2 << 40) | ((uint64_t)1 << 48); + uint64_t result_size_calcu_height = ((uint64_t)args.image0.height - 1) | + ((image_amount_per_row / 32 - 1) << 16) | + (((uint64_t)args.image0.height * 2) << 32); + uint64_t image_row_col_padding_down = image_amount_per_row | + (image_amount_per_row << 32); + float quantParam = (args.output.scale_address)[0]; + uint32_t* ew_scale = reinterpret_cast<uint32_t *>(&quantParam); + uint64_t ew_scale_mult_factor = (*ew_scale) | + ((uint64_t)args.const0 << 32) | ((uint64_t)args.const1 << 40); + reg_writeq(0ul, REG_SCALE_PARAMETER); + reg_writeq(image_physical_address, 0x808); + reg_writeq(result_addr_row, 0x810); + reg_writeq(kernel_padding_step, 0x818); + reg_writeq(result_size_calcu_height, 0x820); + reg_writeq(32, 0x828); + reg_writeq(image_row_col_padding_down, 0x830); + reg_writeq(((image_amount_per_row*2) << 32), 0x838); + reg_writeq(ew_scale_mult_factor, 0x840); // dw donot care + reg_writeq(((uint64_t)32 << 32), 0x848); + reg_writeq(0, 0x858); + uint64_t cmd = 0; + cmd = (uint64_t)2 | (((uint64_t)args.relu_enabled) << 8); + reg_writeq(cmd, 0x800); if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { g_fpgainfo.pe_data->pes[PE_IDX_EW]->status = ERROR; @@ -560,12 +560,6 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { PADDLE_MOBILE_ENFORCE(0, "EW Wait Irq Timeout!"); } - // output_scale = reg_readq(REG_SCALE_PARAMETER); - // output_scale = (output_scale << 32) | (output_scale >> 32); - // fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); - //
active_args.activation_type = NONE; - // reg_writeq(reg_ActivationArgs, REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); - pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; #endif @@ -870,7 +864,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) { #endif } - if (sub_conv_num > 1) { + /*if (sub_conv_num > 1) { float max_scale = -1.0f; #ifdef COST_TIME_PRINT gettimeofday(&start, NULL); @@ -894,19 +888,7 @@ int ComputeFpgaDeconv(const struct DeconvArgs &args) { << " cost time: " << (dif_sec * 1000000 + dif_usec) << "us" << std::endl; #endif - - // fpga_flush(args.output.scale_address, 2 * sizeof(float)); - /*#ifdef COST_TIME_PRINT - gettimeofday(&start,NULL); - #endif - //deconv_post_process(args); - #ifdef COST_TIME_PRINT - gettimeofday(&end,NULL); - dif_sec = end.tv_sec - start.tv_sec; - dif_usec = end.tv_usec - start.tv_usec; - std::cout << "deconv_post_process " << " cost time: " << - (dif_sec*1000000+dif_usec) << "us" << std::endl; #endif*/ - } + }*/ return 0; } // ComputeFpgaDeconv @@ -940,8 +922,8 @@ int ComputeDWConv(const struct DWconvArgs &args) { << " image_width:" << args.image.width << " pad_height:" << args.image.pad_height << " pad_width:" << args.image.pad_width; - DLOG << " filter_address:" << args.filter_address - << " bias_address:" << args.bias_address; + DLOG << " filter_address:" << args.filter_address; + //<< " bias_address:" << args.bias_address; DLOG << " kernel_height:" << args.kernel.height << " kernel_width:" << args.kernel.width << " stride_h:" << args.kernel.stride_h @@ -951,11 +933,10 @@ int ComputeDWConv(const struct DWconvArgs &args) { #endif #ifdef PADDLE_MOBILE_ZU5 DLOG << "DWConv"; + uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT); // return 0; - uint64_t output_scale = 0; uint64_t timer_cnt = 0; int ret = 0; - // uint64_t cmd = args.relu_enabled; uint64_t cmd = 0; uint64_t image_physical_address = 0; uint64_t output_physical_address = 0; @@ -966,57 +947,69 @@ int ComputeDWConv(const struct DWconvArgs &args) { output_physical_address = vaddr_to_paddr(args.output.address); filter_physical_address = vaddr_to_paddr(args.filter_address); bias_physical_address = vaddr_to_paddr(args.bias_address); - uint64_t filter_N_align = - align_to_x((uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t filter_amount_per_row_align = - filter_N_align * (uint64_t)args.kernel.width; - uint64_t sub_filter_amount_align = filter_N_align * - (uint64_t)args.kernel.width * - (uint64_t)args.kernel.height; - uint64_t filter_amount_align = - sub_filter_amount_align * (uint64_t)args.sub_conv_num; - - uint32_t output_height = (uint32_t)( - (args.image.height + args.image.pad_height * 2 - args.kernel.height) / - args.kernel.stride_h + - 1); - uint32_t output_width = (uint32_t)( - ((args.image.width + args.image.pad_width * 2 - args.kernel.width) / - args.kernel.stride_w + - 1) * - args.sub_conv_num); + uint64_t C_align_64 = align_to_x((uint64_t)args.image.channels, 64); + uint64_t C_align_32 = align_to_x((uint64_t)args.image.channels, 32); + uint64_t output_height = (uint64_t) + ((args.image.height + args.image.pad_height * 2 - + args.kernel.height) / args.kernel.stride_h +1); + uint64_t output_width = (uint64_t) + (((args.image.width + args.image.pad_width * 2 - args.kernel.width) / + args.kernel.stride_w + 1) * args.sub_conv_num); uint64_t image_amount_per_row = - align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, - IMAGE_ALIGNMENT); + align_to_x((uint64_t)args.image.width * + (uint64_t)args.image.channels, IMAGE_ALIGNMENT); uint64_t 
image_one_pad_per_row = - align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, - FILTER_ELEMENT_ALIGNMENT) + - (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; - uint64_t image_two_pad_per_row = align_to_x( - ((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) * - (uint64_t)args.image.channels, - IMAGE_ALIGNMENT); - uint64_t image_row_mul_pooling_hight = - image_amount_per_row * (uint64_t)args.kernel.height; - uint64_t image_row_mul_pad_hight = - image_amount_per_row * (uint64_t)args.image.pad_height; - uint64_t image_row_mul_step_hight = - image_amount_per_row * (uint64_t)args.kernel.stride_h; - uint64_t result_amount_align_32 = - align_to_x((uint64_t)output_width * (uint64_t)args.image.channels, - FILTER_ELEMENT_ALIGNMENT); - uint64_t result_amount_align_64 = align_to_x( - (uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGNMENT); - uint64_t image_calcu_height = - (uint64_t)args.kernel.height + - ((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h; - uint64_t image_pad_left = args.image.channels * args.image.pad_width; - uint64_t image_skip_window = args.image.channels * args.kernel.stride_w; - - uint64_t image_padleft_skipwindow = - (image_skip_window << 32) | image_pad_left; - + (uint64_t)args.image.width * (uint64_t)args.image.channels + + (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; + + uint64_t result_amount_align_32 = align_to_x( + (uint64_t)output_width * (uint64_t)args.image.channels, 32); + uint64_t result_addr_row = + (result_amount_align_32 << 32) | output_physical_address; + uint64_t row_padding_down = + (uint64_t)args.image.height + (uint64_t)args.image.pad_height; + uint64_t kernel_width_sub1 = (uint64_t)args.kernel.width - 1; + uint64_t kernel_padding_step = row_padding_down | + ((uint64_t)args.image.pad_height << 16) | + ((uint64_t)args.kernel.stride_h << 24) | + ((uint64_t)kernel_width_sub1<<32) | + ((uint64_t)args.kernel.height << 40) | + ((uint64_t)(args.kernel.height-1) << 48); + uint64_t image_calcu_height = (uint64_t)args.kernel.height + + (output_height - 1) * (uint64_t)args.kernel.stride_h; + uint64_t result_size_calcu_height = (output_height - 1) | + ((output_width - 1) << 16) | (image_calcu_height << 32); + uint64_t col_padding_down = ((uint64_t)args.image.width + + (uint64_t)args.image.pad_width) * (uint64_t)args.image.channels; + + uint64_t image_row_col_padding_down = + image_amount_per_row | (col_padding_down << 32); + uint64_t image_rowXpadding_h = + image_amount_per_row * (uint64_t)args.image.pad_height; + uint64_t image_rowXstep_h = + image_amount_per_row * (uint64_t)args.kernel.stride_h; + uint64_t image_rowXpad_h_rowXstep_h = + image_rowXpadding_h | (image_rowXstep_h << 32); + uint64_t channelXpad_w = + (uint64_t)args.image.channels * (uint64_t)args.image.pad_width; + uint64_t channelXstep_w = + (uint64_t)args.image.channels * (uint64_t)args.kernel.stride_w; + uint64_t channelXpad_w_channelXstep_w = + channelXpad_w | (channelXstep_w << 32); + + uint64_t filter_row_align = + C_align_64 * (uint64_t)args.kernel.width; + uint64_t sub_filter_amount_align = C_align_64 * + (uint64_t)args.kernel.width * + (uint64_t)args.kernel.height; + uint64_t filter_amount_align = + sub_filter_amount_align * (uint64_t)args.sub_conv_num; + uint64_t filter_param = filter_row_align | (filter_amount_align << 16) | + (sub_filter_amount_align << 32) | + (((uint64_t)args.sub_conv_num -1) << 48); + uint64_t channel_parameter = + (uint64_t)args.image.channels | (C_align_64 << 16); 
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { ret = -EIO; @@ -1024,73 +1017,31 @@ int ComputeDWConv(const struct DWconvArgs &args) { pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; } - - /*restart scale*/ - reg_writeq(output_scale, REG_SCALE_PARAMETER); - - reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); - reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); - reg_writeq((bias_physical_address << 32 | filter_physical_address), - REG_DWCONV_FILTER_BASE_ADDR); - reg_writeq(filter_amount_per_row_align | (filter_amount_align << 32), - REG_DWCONV_FILTER_SHAPE); - reg_writeq(sub_filter_amount_align | (((uint64_t)args.sub_conv_num) << 32), - REG_DWCONV_FILTER_SUBNUMBER); - reg_writeq(filter_N_align, REG_DWCONV_FILTER_N_ALIGN); - - reg_writeq( - ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), - REG_POOLING_IMAGE_PIXEL); - reg_writeq( - ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), - REG_POOLING_WINDOW_SIZE); - - reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32), - REG_POOLING_RESULT_PIXEL); - - reg_writeq(((uint64_t)args.image.pad_height) | - (((uint64_t)args.image.pad_width) << 32), - REG_POOLING_PAD_PIXEL); - reg_writeq(((uint64_t)args.kernel.stride_h) | - (((uint64_t)args.kernel.stride_w) << 32), - REG_POOLING_STEP_PIXEL); - - reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER); - - reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW); - reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW); - reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW); - - reg_writeq(image_row_mul_pooling_hight, - REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT); - reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT); - reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT); - - reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32); - reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64); - - reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT); - - reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW); - - /*SDK刷Cache保证数据一致性*/ - - reg_writeq(cmd, REG_DWCONV_CMD); + reg_writeq(0ul, REG_SCALE_PARAMETER); + reg_writeq(image_physical_address, 0x808); + reg_writeq(result_addr_row, 0x810); + reg_writeq(kernel_padding_step, 0x818); + reg_writeq(result_size_calcu_height, 0x820); + reg_writeq(channel_parameter, 0x828); + reg_writeq(image_row_col_padding_down, 0x830); + reg_writeq(image_rowXpad_h_rowXstep_h, 0x838); + reg_writeq(0, 0x840); + reg_writeq(channelXpad_w_channelXstep_w, 0x848); + reg_writeq(filter_physical_address, 0x850); + reg_writeq(filter_param, 0x858); + reg_writeq(((bias_physical_address+C_align_64*4) | + (bias_physical_address << 32)), 0x860); + cmd = (uint64_t)1 | (((uint64_t)args.relu_enabled) << 8); + reg_writeq(cmd, 0x800); DLOG << "before reg poll"; if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR; ret = -EIO; - DLOG << "Pooling Wait Irq Timeout!"; + DLOG << "DWconv Wait Irq Timeout!"; PADDLE_MOBILE_ENFORCE(0, "DWConv Wait Irq Timeout"); } DLOG << "after reg poll"; - - // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); - output_scale = reg_readq(REG_SCALE_PARAMETER); - output_scale = (output_scale << 32) | (output_scale >> 32); - fpga_copy(args.output.scale_address, 
&output_scale, sizeof(float) * 2); - DLOG << "output_scale:" << output_scale; pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); return ret; #endif diff --git a/mobile/src/fpga/common/driver.cpp b/mobile/src/fpga/common/driver.cpp old mode 100644 new mode 100755 index 911704965a..b7ce4d3247 --- a/mobile/src/fpga/common/driver.cpp +++ b/mobile/src/fpga/common/driver.cpp @@ -134,9 +134,9 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) { uint64_t i = 0; /* timeout accuracy still to be confirmed */ int64_t timeout = time * 6; - usleep(1); for (i = 0; i < timeout; i++) { + usleep(1); if (val == reg_readq(reg)) { break; } diff --git a/mobile/src/fpga/common/fpga_common.h b/mobile/src/fpga/common/fpga_common.h old mode 100644 new mode 100755 index a798d54459..a767cd2606 --- a/mobile/src/fpga/common/fpga_common.h +++ b/mobile/src/fpga/common/fpga_common.h @@ -211,6 +211,7 @@ struct ConcatArgs { uint32_t out_channel; uint32_t height; uint32_t width; + std::vector<std::shared_ptr<char>> vector_concat_space; }; struct SplitConvArgs { diff --git a/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp old mode 100644 new mode 100755 index 951fbb5f37..56cc8927f0 --- a/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/anchor_generator_kernel.cpp @@ -37,7 +37,7 @@ bool AnchorGeneratorKernel::Init( int anchors_offset[] = {-2, -2, 18, 18, -10, -9, 26, 25, -23, -20, 39, 36, -43, -34, 59, 49, -63, -54, 79, 69, -96, -77, 112, 93, -137, -118, 153, - 134, -204, -188, 220, 204, -281, -395, 296, 441}; + 134, -204, -188, 220, 204, -281, -395, 296, 411}; int anchors_offset2[] = {0, 0, 51, 77, 0, 0, 30, 35, 0, 0, 81, 103, 0, 0, 20, 21, 0, 0, 36, 44, 0, 0, 43, 58, diff --git a/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp old mode 100644 new mode 100755 index 716531fcab..8442eef8b2 --- a/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp @@ -53,6 +53,15 @@ bool ConcatKernel::Init(ConcatParam *param) { concatArgs.channel_num = channel_num; concatArgs.height = height; concatArgs.width = width; + + auto deleter = [](void *p) { fpga::fpga_free(p); }; + concatArgs.vector_concat_space.push_back(std::shared_ptr<char>( + reinterpret_cast<char *>(concatArgs.images_in), deleter)); + concatArgs.vector_concat_space.push_back(std::shared_ptr<char>( + reinterpret_cast<char *>(concatArgs.scales_in), deleter)); + concatArgs.vector_concat_space.push_back(std::shared_ptr<char>( + reinterpret_cast<char *>(concatArgs.channel_num), deleter)); + param->SetFpgaArgs(concatArgs); return true; } diff --git a/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp old mode 100644 new mode 100755 index 43b9355c99..57ccf9f00d --- a/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp @@ -12,12 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ #ifdef ELEMENTWISEADD_OP - +#include #include "operators/kernel/elementwise_add_kernel.h" -#include -#include "fpga/V2/api.h" - namespace paddle_mobile { namespace operators { @@ -60,10 +57,36 @@ bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { return true; } +void ComputeCPUEWAdd(fpga::EWAddArgs ewaddArgs) { + int inputc = ewaddArgs.image0.channels; + int inputh = ewaddArgs.image0.height; + int inputw = ewaddArgs.image0.width; + float inScale0 = + (reinterpret_cast<float *>(ewaddArgs.image0.scale_address))[0]; + float inScale1 = + (reinterpret_cast<float *>(ewaddArgs.image1.scale_address))[0]; + float outScale = + (reinterpret_cast<float *>(ewaddArgs.output.scale_address))[0]; + int8_t* inPtr0 = reinterpret_cast<int8_t *>(ewaddArgs.image0.address); + int8_t* inPtr1 = reinterpret_cast<int8_t *>(ewaddArgs.image1.address); + int8_t* outPtr = reinterpret_cast<int8_t *>(ewaddArgs.output.address); + int datasize = inputc * inputh * inputw; + float const0 = inScale0 / outScale; + float const1 = inScale1 / outScale; + fpga::fpga_invalidate(inPtr0, datasize * sizeof(int8_t)); + fpga::fpga_invalidate(inPtr1, datasize * sizeof(int8_t)); + for (int i = 0; i < datasize; i++) { + float tmpF = inPtr0[i] * const0 + inPtr1[i] * const1; + int tmpI = static_cast<int>(round(tmpF)); + outPtr[i] = (int8_t)((tmpI > 127 ? 127 : (tmpI < -127 ? -127 : tmpI))); + } + fpga::fpga_flush(outPtr, datasize * sizeof(int8_t)); +} template <> void ElementwiseAddKernel::Compute( const ElementwiseAddParam &param) { - fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + // fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + ComputeCPUEWAdd(param.FpgaArgs()); } } // namespace operators } // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp old mode 100644 new mode 100755 index 6d5ad50573..de60341874 --- a/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifdef FUSION_ELEMENTWISEADDRELU_OP - +#include #include "operators/kernel/elementwise_add_relu_kernel.h" namespace paddle_mobile { @@ -58,10 +58,37 @@ bool ElementwiseAddReluKernel::Init( return true; } +void ComputeCPUEWAddRelu(fpga::EWAddArgs ewaddArgs) { + int inputc = ewaddArgs.image0.channels; + int inputh = ewaddArgs.image0.height; + int inputw = ewaddArgs.image0.width; + float inScale0 = + (reinterpret_cast<float *>(ewaddArgs.image0.scale_address))[0]; + float inScale1 = + (reinterpret_cast<float *>(ewaddArgs.image1.scale_address))[0]; + float outScale = + (reinterpret_cast<float *>(ewaddArgs.output.scale_address))[0]; + int8_t* inPtr0 = reinterpret_cast<int8_t *>(ewaddArgs.image0.address); + int8_t* inPtr1 = reinterpret_cast<int8_t *>(ewaddArgs.image1.address); + int8_t* outPtr = reinterpret_cast<int8_t *>(ewaddArgs.output.address); + int datasize = inputc * inputh * inputw; + float const0 = inScale0 / outScale; + float const1 = inScale1 / outScale; + fpga::fpga_invalidate(inPtr0, datasize * sizeof(int8_t)); + fpga::fpga_invalidate(inPtr1, datasize * sizeof(int8_t)); + for (int i = 0; i < datasize; i++) { + float tmpF = inPtr0[i] * const0 + inPtr1[i] * const1; + int tmpI = static_cast<int>(round(tmpF)); + outPtr[i] = (int8_t)((tmpI > 127 ? 127 : (tmpI < 0 ?
0 : tmpI))); + } + fpga::fpga_flush(outPtr, datasize * sizeof(int8_t)); +} + template <> void ElementwiseAddReluKernel::Compute( const ElementwiseAddReluParam &param) { - fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + // fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + ComputeCPUEWAddRelu(param.FpgaArgs()); } } // namespace operators } // namespace paddle_mobile diff --git a/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp old mode 100644 new mode 100755 index fcf0889b4a..c7cd6575e4 --- a/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp @@ -110,7 +110,27 @@ void Reshape2Kernel::Compute(const Reshape2Param &param) { } } output->Resize(framework::make_ddim(shape)); + + bool reshapeNeedFlg = 1; if (output->dims() == input->dims()) { + reshapeNeedFlg = 0; + } else if (output->dims().size() != input->dims().size()) { + auto inputdimsize = input->dims().size(); + auto outputdimsize = output->dims().size(); + int smallersize = + inputdimsize > outputdimsize ? outputdimsize : inputdimsize; + int i = 0; + for (i = 0; i < smallersize; i++) { + if ((input->dims())[i] != (output->dims())[i]) + break; + } + if (i == smallersize) { + reshapeNeedFlg = 0; + } + } + if (reshapeNeedFlg) { + reshape(input, output); + } else { DLOG << "No need to reshape"; output->ShareDataWith(*input); framework::LoD lod = input->lod(); @@ -118,9 +138,6 @@ void Reshape2Kernel::Compute(const Reshape2Param &param) { output->scale[0] = input->scale[0]; return; } - - reshape(input, output); - // } } // namespace operators diff --git a/mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp index 194fd5a305..44aae4be32 100644 --- a/mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/sigmoid_kernel.cpp @@ -48,7 +48,7 @@ bool SigmoidKernel::Init(SigmoidParam *param) { template <> void SigmoidKernel::Compute(const SigmoidParam &param) { fpga::PerformBypass(param.FpgaArgs()); - param.Out()->scale[0] = 127.0; + param.Out()->scale[0] = 1.0; } } // namespace operators diff --git a/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp old mode 100644 new mode 100755 index a1500ecdb0..d32dddb307 --- a/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp +++ b/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp @@ -30,6 +30,7 @@ bool SliceKernel::Init(SliceParam* param) { } return true; } + template <> void SliceKernel::Compute(const SliceParam& param) { // Only support slicing in channel dimension @@ -38,6 +39,8 @@ void SliceKernel::Compute(const SliceParam& param) { auto input = param.input_; auto output = param.output_; + int H = input->dims()[2]; + int W = input->dims()[3]; int HW = input->dims()[2] * input->dims()[3]; int channel = input->dims()[1]; auto input_ptr = input->data(); @@ -53,10 +56,32 @@ void SliceKernel::Compute(const SliceParam& param) { end = end > channel ?
channel : end; int len = end - start; size_t size = len * sizeof(int8_t); + DLOG << input->fpga_data_num; + fpga::fpga_invalidate(input_ptr, input->fpga_data_num*sizeof(int8_t)); + DLOG << output->fpga_data_num; + fpga::fpga_invalidate(output_ptr, output->fpga_data_num*sizeof(int8_t)); + int unalignedWC = len * W; + int alignedWC = fpga::align_to_x(W * len, IMAGE_ALIGNMENT); - for (int i = 0; i < HW; i++) { - memcpy(output_ptr + len * i, input_ptr + i * channel + start, size); + if (unalignedWC != alignedWC) { + auto tmpOutput = reinterpret_cast<int8_t *> + (fpga::fpga_malloc(len*HW * sizeof(int8_t))); + for (int i = 0; i < HW; i++) { + memcpy(tmpOutput + len * i, input_ptr + i * channel + start, size); + } + for (int i = 0; i < H; i++) { + for (int j = 0; j < unalignedWC; j++) { + *(output_ptr + alignedWC * i + j) = + *(tmpOutput + unalignedWC * i + j); + } + } + fpga::fpga_free(tmpOutput); + } else { + for (int i = 0; i < HW; i++) { + memcpy(output_ptr + len * i, input_ptr + i * channel + start, size); + } } + fpga::fpga_flush(output_ptr, output->fpga_data_num*sizeof(int8_t)); } } // namespace operators } // namespace paddle_mobile diff --git a/mobile/src/operators/math/depthwise_conv3x3.cpp b/mobile/src/operators/math/depthwise_conv3x3.cpp index 11fce28605..4f8b7a7b30 100644 --- a/mobile/src/operators/math/depthwise_conv3x3.cpp +++ b/mobile/src/operators/math/depthwise_conv3x3.cpp @@ -150,7 +150,8 @@ void DepthwiseConv3x3S1(const framework::Tensor &input, const int out_image_size = output_h * output_w; const int valid_h_start = padding_h; const int valid_h_end = output_h - valid_h_start; - const int valid_h = valid_h_end - valid_h_start; + const int valid_h = + valid_h_end - valid_h_start > 0 ? valid_h_end - valid_h_start : 0; const int valid_w_start = padding_w; const int valid_w_end = output_w - valid_w_start; const int valid_w = valid_w_end - valid_w_start; @@ -631,7 +632,7 @@ void DepthwiseConv3x3S1(const framework::Tensor &input, } } // pad bottom - for (int h = valid_h_end; h < output_h; ++h) { + for (int h = valid_h_end; (h < output_h) && (h > valid_h_start - 1); ++h) { DepthwiseConv3x3NormalRow<1, 1>(input_ptr, filter_ptr, h, input_h, input_w, padding_h, padding_w, output_w, output_ptr, _ker); @@ -659,7 +660,8 @@ void DepthwiseConv3x3S2(const framework::Tensor &input, const int valid_h_start = (padding_h + 1) / 2; const int valid_h_end = std::max((input_h + padding_h - 1) / 2, valid_h_start); - const int valid_h = valid_h_end - valid_h_start; + const int valid_h = + valid_h_end - valid_h_start > 0 ? valid_h_end - valid_h_start : 0; const int valid_w_start = (padding_w + 1) / 2; const int valid_w_end = std::max((input_w + padding_w - 1) / 2, valid_w_start); @@ -1045,7 +1047,7 @@ void DepthwiseConv3x3S2(const framework::Tensor &input, } } // pad bottom - for (int h = valid_h_end; h < output_h; ++h) { + for (int h = valid_h_end; (h < output_h) && (h > valid_h_start - 1); ++h) { DepthwiseConv3x3NormalRow<2, 2>(input_ptr, filter_ptr, h, input_h, input_w, padding_h, padding_w, output_w, output_ptr, _ker); -- GitLab
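The pooling, elementwise-add, and depthwise-conv paths in this patch all program the V2 PE by packing several narrow fields into single 64-bit registers (kernel_padding_step at 0x818, result_size_calcu_height at 0x820, mult_factor at 0x840, and so on). The following is a minimal sketch of that packing, not part of the patch: the struct, helper names, and field widths are inferred from the shift-and-OR expressions in ComputeFpgaPool and ComputeDWConv, not from a documented register map.

#include <cstdint>
#include <cstring>

// Field layout implied by the kernel_padding_step expression above
// (bit positions taken from the shifts; field widths are assumptions).
struct KernelPaddingStep {
  uint16_t row_padding_down;    // bits  0..15: image.height + image.pad_height
  uint8_t pad_height;           // bits 16..23
  uint8_t stride_h;             // bits 24..31
  uint8_t kernel_width_sub1;    // bits 32..39: kernel.width - 1
  uint8_t kernel_height;        // bits 40..47
  uint8_t kernel_height_sub1;   // bits 48..55: kernel.height - 1
};

static inline uint64_t pack_kernel_padding_step(const KernelPaddingStep &f) {
  return (uint64_t)f.row_padding_down | ((uint64_t)f.pad_height << 16) |
         ((uint64_t)f.stride_h << 24) | ((uint64_t)f.kernel_width_sub1 << 32) |
         ((uint64_t)f.kernel_height << 40) |
         ((uint64_t)f.kernel_height_sub1 << 48);
}

// The average-pooling mult_factor embeds the raw IEEE-754 bits of the float
// reciprocal; memcpy sidesteps the strict-aliasing concern of the
// reinterpret_cast used in the patch. 1.0f yields 0x3f800000, which is the
// max-pooling default written above.
static inline uint32_t float_bits(float v) {
  uint32_t bits = 0;
  std::memcpy(&bits, &v, sizeof(bits));
  return bits;
}

Packing through a named struct and one helper keeps the bit positions in a single place, which would make it easier to keep ComputeFpgaPool and ComputeDWConv in sync if the register map changes.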
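ComputeCPUEWAdd and ComputeCPUEWAddRelu introduced above differ only in the lower clamp bound (-127 versus 0); both rescale two int8 inputs into the output quantization scale, round, and saturate. A condensed sketch of that shared arithmetic follows, with hypothetical names; it assumes per-tensor float scales as in EWAddArgs and leaves out the cache invalidate/flush calls.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical standalone form of the requantized add used by both CPU
// fallbacks: out = clamp(round(a * scale0/out_scale + b * scale1/out_scale)).
static void ewadd_int8(const int8_t *a, const int8_t *b, int8_t *out, int n,
                       float scale0, float scale1, float out_scale, bool relu) {
  const float c0 = scale0 / out_scale;
  const float c1 = scale1 / out_scale;
  const int lo = relu ? 0 : -127;  // the ReLU variant clamps negatives to zero
  for (int i = 0; i < n; ++i) {
    int v = static_cast<int>(std::round(a[i] * c0 + b[i] * c1));
    out[i] = static_cast<int8_t>(std::min(127, std::max(lo, v)));
  }
}

With a shared helper like this, the two kernels would differ only in the relu flag they pass, which mirrors how ComputeFpgaEWAdd folds relu_enabled into bit 8 of the 0x800 command word.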