From 36004a9aa4bc963bf406299235c86d0d42ebf0d5 Mon Sep 17 00:00:00 2001 From: "baolei.an" Date: Thu, 9 Jan 2020 15:12:14 +0800 Subject: [PATCH] update the newest version --- README.md | 75 +- cmake/cross_compiling/ios.cmake | 1 + cmake/cross_compiling/npu.cmake | 2 +- lite/CMakeLists.txt | 16 +- lite/api/CMakeLists.txt | 14 +- lite/api/cxx_api.cc | 6 +- lite/api/cxx_api_impl.cc | 4 +- lite/api/lite_multithread_test.cc | 0 lite/api/model_optimize_tool.cc | 220 +- lite/api/model_test.cc | 1 + lite/api/paddle_api.h | 14 +- lite/api/test_step_rnn_lite_x86.cc | 4 +- .../arm/math/conv3x3s1_depthwise_fp32.cc | 538 -- .../arm/math/conv3x3s2_depthwise_fp32.cc | 361 -- .../backends/arm/math/conv_depthwise_3x3p0.cc | 4178 -------------- .../backends/arm/math/conv_depthwise_3x3p1.cc | 4850 ----------------- .../backends/arm/math/conv_depthwise_3x3s1.cc | 2539 --------- .../backends/arm/math/conv_depthwise_3x3s2.cc | 1862 ------- lite/backends/arm/math/reduce_prod.cc | 0 lite/backends/arm/math/reduce_prod.h | 0 .../arm/math/split_merge_lod_tenosr.cc | 0 .../arm/math/split_merge_lod_tenosr.h | 0 lite/backends/fpga/KD/debugger.hpp | 0 lite/backends/fpga/KD/dl_engine.cpp | 0 lite/backends/fpga/KD/dl_engine.hpp | 0 lite/backends/fpga/KD/llapi/zynqmp_api.cpp | 0 lite/backends/fpga/KD/llapi/zynqmp_api.h | 0 lite/backends/fpga/KD/pes/conv_process.hpp | 0 lite/backends/fpga/KD/pes/crop_pe.cpp | 0 .../fpga/KD/pes/depthwise_conv_pe.hpp | 0 .../fpga/KD/pes/elementwise_mul_pe.hpp | 0 .../fpga/KD/pes/fully_connected_pe.hpp | 0 lite/backends/fpga/KD/pes/gru_pe.hpp | 0 lite/backends/fpga/KD/pes/gru_util.hpp | 0 lite/backends/fpga/KD/pes/output_pe.hpp | 0 lite/backends/fpga/KD/pes/pooling_pe.hpp | 0 lite/backends/fpga/KD/pes/scale_pe.hpp | 0 lite/backends/fpga/lite_tensor.cc | 0 lite/backends/npu/builder.cc | 192 - lite/backends/npu/builder.h | 145 - lite/backends/npu/device.cc | 0 lite/backends/npu/device.h | 4 +- lite/backends/npu/runtime.cc | 60 - lite/backends/npu/runtime.h | 50 - .../cl_kernel/image/conv2d_1x1_kernel.cl | 0 .../opencl/cl_kernel/image/reshape_kernel.cl | 0 lite/backends/x86/jit/README.en.md | 2 +- lite/backends/x86/jit/README.md | 2 +- lite/backends/x86/jit/gen/CMakeLists.txt | 54 +- lite/backends/x86/jit/gen/act.cc | 12 +- lite/backends/x86/jit/gen/blas.cc | 14 +- lite/backends/x86/jit/gen/embseqpool.cc | 2 +- lite/backends/x86/jit/gen/gru.cc | 6 +- lite/backends/x86/jit/gen/hopv.cc | 4 +- lite/backends/x86/jit/gen/lstm.cc | 4 +- lite/backends/x86/jit/gen/matmul.cc | 2 +- lite/backends/x86/jit/gen/seqpool.cc | 2 +- lite/backends/x86/jit/gen/sgd.cc | 2 +- lite/backends/x86/jit/gen/vbroadcast.cc | 2 +- lite/backends/x86/jit/more/CMakeLists.txt | 4 +- .../x86/jit/more/intrinsic/CMakeLists.txt | 4 +- lite/backends/x86/jit/more/mix/CMakeLists.txt | 16 +- lite/backends/x86/jit/more/mkl/CMakeLists.txt | 30 +- lite/backends/x86/jit/refer/CMakeLists.txt | 66 +- lite/backends/x86/jit/refer/refer.cc | 2 +- lite/backends/x86/jit/registry.h | 122 +- lite/backends/x86/parallel.h | 0 lite/backends/xpu/builder.cc | 189 - lite/backends/xpu/builder.h | 60 - lite/backends/xpu/device.cc | 7 +- lite/backends/xpu/device.h | 22 +- lite/backends/xpu/runtime.cc | 46 - lite/backends/xpu/runtime.h | 69 - lite/core/CMakeLists.txt | 10 +- lite/core/arena/CMakeLists.txt | 2 +- lite/core/framework.proto | 1 - lite/core/kernel.h | 9 +- ...elementwise_mul_constant_eliminate_pass.cc | 0 .../elementwise_add_activation_fuse_pass.cc | 4 +- lite/core/mir/fusion/fc_fuse_pass.cc | 11 +- lite/core/mir/fusion/fc_fuse_pass_test.cc | 1 + 
lite/core/mir/fusion/fc_fuser.cc | 16 +- lite/core/mir/fusion/fc_fuser.h | 2 + .../fusion/sequence_pool_concat_fuse_pass.cc | 0 .../fusion/sequence_pool_concat_fuse_pass.h | 0 .../mir/fusion/sequence_pool_concat_fuser.cc | 0 .../mir/fusion/sequence_pool_concat_fuser.h | 0 .../var_conv_2d_activation_fuse_pass.cc | 0 .../fusion/var_conv_2d_activation_fuse_pass.h | 0 .../fusion/var_conv_2d_activation_fuser.cc | 0 .../mir/fusion/var_conv_2d_activation_fuser.h | 0 lite/core/mir/generate_program_pass.cc | 1 - lite/core/mir/subgraph/CMakeLists.txt | 2 +- lite/core/mir/subgraph/subgraph_detector.cc | 2 +- lite/core/mir/subgraph/subgraph_detector.h | 0 .../mir/subgraph/subgraph_detector_test.cc | 0 lite/core/mir/subgraph/subgraph_pass.cc | 6 +- lite/core/mir/subgraph/subgraph_pass.h | 0 lite/core/mir/subgraph/subgraph_pass_test.cc | 2 +- lite/core/profile/profiler.cc | 78 +- lite/core/profile/profiler.h | 26 +- lite/core/profile/test_timer.cc | 6 +- lite/core/program.cc | 16 +- lite/core/program.h | 13 +- lite/core/tensor.h | 21 +- lite/demo/cxx/README.md | 110 +- .../mobile_classify/Makefile.android.armv7 | 0 .../mobile_classify/Makefile.android.armv8 | 0 .../Makefile.android.armv7 | 12 +- .../Makefile.android.armv8 | 12 +- .../makefiles/test_cv/Makefile.android.armv7 | 71 + .../makefiles/test_cv/Makefile.android.armv8 | 70 + .../yolov3_detection/Makefile.android.armv7 | 61 + .../yolov3_detection/Makefile.android.armv8 | 61 + .../cxx/mobile_classify/mobile_classify.cc | 2 +- lite/demo/cxx/mobile_detection/test.jpg | Bin 127499 -> 0 bytes .../ssd_detection.cc} | 2 +- lite/demo/cxx/test_cv/README.md | 131 + lite/demo/cxx/test_cv/test_img_prepross.cc | 389 ++ lite/demo/cxx/test_cv/test_model_cv.cc | 224 + .../cxx/yolov3_detection/yolov3_detection.cc | 238 + lite/kernels/arm/CMakeLists.txt | 2 +- .../arm/collect_fpn_proposals_compute.cc | 0 .../arm/collect_fpn_proposals_compute.h | 0 lite/kernels/arm/conditional_block_compute.cc | 0 lite/kernels/arm/conditional_block_compute.h | 0 lite/kernels/arm/conv_compute.cc | 6 +- .../arm/conv_transpose_compute_test.cc | 371 -- .../arm/distribute_fpn_proposals_compute.cc | 0 .../arm/distribute_fpn_proposals_compute.h | 0 lite/kernels/arm/grid_sampler_compute.cc | 0 lite/kernels/arm/grid_sampler_compute.h | 0 lite/kernels/arm/instance_norm_compute.cc | 0 lite/kernels/arm/instance_norm_compute.h | 0 lite/kernels/arm/merge_lod_tensor_compute.cc | 0 lite/kernels/arm/merge_lod_tensor_compute.h | 0 .../arm/merge_lod_tensor_compute_test.cc | 0 lite/kernels/arm/reduce_prod_compute.cc | 0 lite/kernels/arm/reduce_prod_compute.h | 0 lite/kernels/arm/split_lod_tensor_compute.cc | 0 lite/kernels/arm/split_lod_tensor_compute.h | 0 .../arm/split_lod_tensor_compute_test.cc | 0 lite/kernels/arm/yolo_box_compute.cc | 2 + lite/kernels/cuda/CMakeLists.txt | 2 +- lite/kernels/cuda/conv_compute_test.cc | 1 + lite/kernels/cuda/elementwise_add_compute.cu | 139 - lite/kernels/cuda/elementwise_add_compute.h | 53 - .../cuda/elementwise_add_compute_test.cc | 166 - lite/kernels/cuda/mul_compute.h | 1 - .../cuda/sequence_pool_concat_compute.cu | 0 .../cuda/sequence_pool_concat_compute.h | 0 lite/kernels/cuda/yolo_box_compute.cu | 2 +- lite/kernels/fpga/CMakeLists.txt | 2 +- lite/kernels/fpga/calib_compute.cc | 0 lite/kernels/fpga/conv_compute.cc | 0 lite/kernels/fpga/conv_compute.h | 0 lite/kernels/fpga/dropout_compute.cc | 0 lite/kernels/fpga/elementwise_compute.cc | 0 lite/kernels/fpga/fc_compute.h | 0 lite/kernels/fpga/feed_compute.cc | 0 lite/kernels/fpga/feed_compute.h | 0 
lite/kernels/fpga/fetch_compute.h | 0 lite/kernels/fpga/gru_compute.h | 0 lite/kernels/fpga/im2sequence_compute.cc | 0 lite/kernels/fpga/im2sequence_compute.h | 0 lite/kernels/fpga/mul_compute.h | 0 lite/kernels/fpga/multiclass_nms_compute.cc | 0 lite/kernels/fpga/norm_compute.cc | 0 lite/kernels/fpga/norm_compute.h | 0 lite/kernels/fpga/pooling_compute_test.cc | 0 lite/kernels/fpga/prior_box_compute.cc | 0 lite/kernels/fpga/prior_box_compute.h | 0 lite/kernels/fpga/reshape_compute.cc | 0 lite/kernels/fpga/scale_compute.cc | 0 lite/kernels/fpga/scale_compute.h | 0 lite/kernels/fpga/softmax_compute.cc | 0 lite/kernels/fpga/transpose_compute.cc | 0 lite/kernels/npu/bridges/CMakeLists.txt | 3 +- lite/kernels/npu/bridges/act_op.cc | 59 +- lite/kernels/npu/bridges/argmax_op.cc | 21 +- lite/kernels/npu/bridges/argmax_op_test.cc | 0 lite/kernels/npu/bridges/batch_norm_op.cc | 41 +- .../kernels/npu/bridges/batch_norm_op_test.cc | 168 - lite/kernels/npu/bridges/concat_op.cc | 23 +- lite/kernels/npu/bridges/conv_op.cc | 153 +- lite/kernels/npu/bridges/conv_transpose_op.cc | 78 +- lite/kernels/npu/bridges/elementwise_ops.cc | 77 +- lite/kernels/npu/bridges/engine.cc | 6 +- lite/kernels/npu/bridges/engine.h | 0 lite/kernels/npu/bridges/fc_op.cc | 59 +- lite/kernels/npu/bridges/graph.cc | 54 +- lite/kernels/npu/bridges/graph.h | 217 +- lite/kernels/npu/bridges/interpolate_op.cc | 48 +- lite/kernels/npu/bridges/mul_op.cc | 51 +- lite/kernels/npu/bridges/pad2d_op.cc | 33 +- lite/kernels/npu/bridges/paddle_use_bridges.h | 70 +- .../npu/bridges/paddle_use_npu_bridges.h | 55 - lite/kernels/npu/bridges/pool_op.cc | 34 +- lite/kernels/npu/bridges/pool_op_test.cc | 252 - lite/kernels/npu/bridges/reduce_mean_op.cc | 36 +- lite/kernels/npu/bridges/registry.cc | 24 +- lite/kernels/npu/bridges/registry.h | 30 +- lite/kernels/npu/bridges/reshape_op.cc | 68 +- lite/kernels/npu/bridges/scale_op.cc | 38 +- .../kernels/npu/bridges/shuffle_channel_op.cc | 20 +- .../npu/bridges/shuffle_channel_op_test.cc | 117 - lite/kernels/npu/bridges/softmax_op.cc | 31 +- lite/kernels/npu/bridges/split_op.cc | 37 +- lite/kernels/npu/bridges/sqrt_op.cc | 17 +- lite/kernels/npu/bridges/square_op.cc | 17 +- lite/kernels/npu/bridges/transpose_op.cc | 29 +- lite/kernels/npu/bridges/transpose_op_test.cc | 153 - lite/kernels/npu/bridges/unsqueeze_op.cc | 26 +- lite/kernels/npu/bridges/unsqueeze_op_test.cc | 139 - lite/kernels/npu/bridges/utility.cc | 18 +- lite/kernels/npu/bridges/utility.h | 66 +- lite/kernels/npu/graph_compute.cc | 145 - lite/kernels/npu/graph_compute.h | 54 - lite/kernels/npu/subgraph_compute.cc | 46 +- lite/kernels/npu/subgraph_compute.h | 2 +- lite/kernels/opencl/CMakeLists.txt | 10 +- lite/kernels/opencl/conv2d_1x1_compute.cc | 0 .../kernels/opencl/conv2d_1x1_compute_test.cc | 0 lite/kernels/opencl/reshape_compute.cc | 0 lite/kernels/opencl/reshape_compute_test.cc | 0 lite/kernels/x86/fc_compute_test.cc | 100 - lite/kernels/x86/layer_norm_compute.h | 2 +- lite/kernels/x86/relu_compute.cc | 25 - lite/kernels/x86/relu_compute.h | 52 - lite/kernels/xpu/bridges/act_op.cc | 21 +- lite/kernels/xpu/bridges/act_op_test.cc | 102 - lite/kernels/xpu/bridges/batch_norm_op.cc | 38 +- .../kernels/xpu/bridges/batch_norm_op_test.cc | 164 - lite/kernels/xpu/bridges/conv_op.cc | 51 +- lite/kernels/xpu/bridges/dropout_op.cc | 22 +- lite/kernels/xpu/bridges/elementwise_ops.cc | 32 +- lite/kernels/xpu/bridges/gather_op.cc | 46 +- lite/kernels/xpu/bridges/graph.cc | 107 +- lite/kernels/xpu/bridges/graph.h | 183 +- 
lite/kernels/xpu/bridges/layer_norm_op.cc | 56 +- lite/kernels/xpu/bridges/lookup_table_op.cc | 43 +- lite/kernels/xpu/bridges/matmul_op.cc | 75 +- lite/kernels/xpu/bridges/mul_op.cc | 47 +- lite/kernels/xpu/bridges/paddle_use_bridges.h | 44 +- .../xpu/bridges/paddle_use_xpu_bridges.h | 26 - lite/kernels/xpu/bridges/pool_op.cc | 26 +- lite/kernels/xpu/bridges/registry.cc | 41 - lite/kernels/xpu/bridges/registry.h | 93 - lite/kernels/xpu/bridges/reshape_op.cc | 32 +- lite/kernels/xpu/bridges/scale_op.cc | 18 +- lite/kernels/xpu/bridges/slice_op.cc | 18 +- lite/kernels/xpu/bridges/softmax_op.cc | 14 +- lite/kernels/xpu/bridges/stack_op.cc | 20 +- lite/kernels/xpu/bridges/transpose_op.cc | 26 +- lite/kernels/xpu/bridges/utility.cc | 4 +- lite/kernels/xpu/bridges/utility.h | 1 - lite/kernels/xpu/graph_compute.cc | 99 - lite/kernels/xpu/graph_compute.h | 47 - lite/kernels/xpu/subgraph_compute.cc | 46 +- lite/kernels/xpu/subgraph_compute.h | 0 lite/model_parser/naive_buffer/naive_buffer.h | 33 +- lite/model_parser/naive_buffer/param_desc.cc | 9 +- lite/operators/CMakeLists.txt | 3 +- lite/operators/collect_fpn_proposals_op.cc | 0 lite/operators/collect_fpn_proposals_op.h | 0 lite/operators/compare_op.cc | 2 +- lite/operators/conditional_block_op.cc | 0 lite/operators/conditional_block_op.h | 0 lite/operators/distribute_fpn_proposals_op.cc | 0 lite/operators/distribute_fpn_proposals_op.h | 0 lite/operators/dropout_op.cc | 2 +- lite/operators/fc_op.cc | 2 +- lite/operators/graph_op.cc | 58 - lite/operators/graph_op.h | 52 - lite/operators/grid_sampler_op.cc | 0 lite/operators/grid_sampler_op.h | 0 lite/operators/instance_norm_op.cc | 0 lite/operators/instance_norm_op.h | 0 lite/operators/merge_lod_tensor_op.cc | 0 lite/operators/merge_lod_tensor_op.h | 0 lite/operators/reduce_prod_op.cc | 0 lite/operators/reduce_prod_op.h | 0 lite/operators/sequence_pool_concat_op.cc | 0 lite/operators/sequence_pool_concat_op.h | 0 lite/operators/split_lod_tensor_op.cc | 0 lite/operators/split_lod_tensor_op.h | 0 lite/operators/subgraph_op.cc | 0 lite/operators/subgraph_op.h | 0 lite/tests/cv/CMakeLists.txt | 2 +- lite/tests/cv/cv_basic.h | 61 +- lite/tests/cv/image_convert_test.cc | 136 +- lite/tests/kernels/CMakeLists.txt | 130 +- lite/tests/kernels/batch_norm_compute_test.cc | 181 + lite/tests/kernels/dropout_compute_test.cc | 0 lite/tests/kernels/gather_compute_test.cc | 0 .../kernels/grid_sampler_compute_test.cc | 0 .../kernels/instance_norm_compute_test.cc | 0 lite/tests/kernels/layer_norm_compute_test.cc | 0 .../kernels/lookup_table_compute_test.cc | 0 lite/tests/kernels/mul_compute_test.cc | 0 lite/tests/kernels/pool_compute_test.cc | 367 ++ .../tests/kernels/reduce_prod_compute_test.cc | 0 lite/tests/kernels/reshape_compute_test.cc | 38 +- lite/tests/kernels/scale_compute_test.cc | 57 +- .../kernels/shuffle_channel_compute_test.cc | 72 +- lite/tests/kernels/softmax_compute_test.cc | 47 +- lite/tests/kernels/transpose_compute_test.cc | 44 +- lite/tests/kernels/unsqueeze_compute_test.cc | 88 +- lite/tests/utils/timer.h | 105 - lite/tools/build_bm.sh | 112 - lite/tools/build_xpu.sh | 5 + lite/tools/ci_build.sh | 57 + .../create_fake_kernel_registry.py | 3 + .../cmake_tools/parse_kernel_registry.py | 4 + lite/tools/cmake_tools/parse_op_registry.py | 4 + .../cmake_tools/record_supported_kernel_op.py | 129 + lite/utils/cv/CMakeLists.txt | 3 +- lite/utils/cv/image2tensor.cc | 154 +- lite/utils/cv/image_convert.cc | 302 +- lite/utils/cv/image_flip.cc | 26 + lite/utils/cv/image_flip.h | 9 + 
lite/utils/cv/image_resize.cc | 9 + lite/utils/cv/image_resize.h | 10 + lite/utils/cv/image_rotate.cc | 43 +- lite/utils/cv/image_rotate.h | 10 + lite/utils/cv/paddle_image_preprocess.cc | 53 +- lite/utils/cv/paddle_image_preprocess.h | 27 +- lite/utils/env.h | 0 mobile/src/common/log.h | 14 +- mobile/src/fpga/V2/image.cpp | 0 mobile/src/fpga/V2/pe.cpp | 0 mobile/src/framework/cl/cl_deleter.h | 20 +- mobile/src/framework/cl/cl_engine.cpp | 61 +- mobile/src/framework/cl/cl_engine.h | 61 +- mobile/src/framework/cl/cl_helper.h | 4 +- mobile/src/framework/cl/cl_image.h | 50 +- mobile/src/framework/cl/cl_scope.h | 44 +- mobile/src/framework/context.h | 10 +- mobile/src/framework/executor.cpp | 7 +- mobile/src/framework/loader.cpp | 3 +- mobile/src/framework/operator.cpp | 78 +- mobile/src/io/opencl_interface.cpp | 18 + mobile/src/io/opencl_interface.h | 1 + mobile/src/io/paddle_mobile.h | 13 +- mobile/src/operators/expand_op.cpp | 0 mobile/src/operators/expand_op.h | 0 mobile/src/operators/grid_sampler_op.cpp | 0 mobile/src/operators/grid_sampler_op.h | 0 .../kernel/cl/cl_kernel/conv_kernel.inc.cl | 0 .../cl/cl_kernel/elementwise_mul_kernel.cl | 18 + .../cl/cl_kernel/elementwise_sub_kernel.cl | 0 .../operators/kernel/cl/cl_kernel/expend.cl | 0 .../cl/cl_kernel/grid_sampler_kernel.cl | 0 .../kernel/cl/conv_transpose_kernel.cpp | 8 +- .../kernel/cl/elementwise_mul_kernel.cpp | 178 +- .../kernel/cl/elementwise_sub_kernel.cpp | 0 .../src/operators/kernel/cl/expand_kernel.cpp | 0 .../kernel/cl/grid_sampler_kernel.cpp | 0 mobile/src/operators/kernel/expand_kernel.h | 0 .../kernel/fpga/V2/elementwise_add_kernel.cpp | 0 .../fpga/V2/elementwise_add_relu_kernel.cpp | 0 .../kernel/fpga/V2/reshape2_kernel.cpp | 0 .../operators/kernel/fpga/V2/slice_kernel.cpp | 0 .../operators/kernel/grid_sampler_kernel.h | 0 mobile/src/operators/op_param.h | 40 +- mobile/src/pass/memory_optimize_cl.cpp | 13 +- mobile/test/CMakeLists.txt | 6 + mobile/test/executor_for_test_opencl.h | 0 mobile/test/net/test_inference_api_v2.cpp | 0 mobile/test/net/test_mobilenet_male2fe.cpp | 66 + mobile/test/net/test_net_multi_feed.cpp | 0 mobile/test/operators/test_expend_op.cpp | 0 .../tools/python/fluidtools/run_multi_feed.py | 0 373 files changed, 5457 insertions(+), 20291 deletions(-) mode change 100755 => 100644 lite/api/lite_multithread_test.cc delete mode 100644 lite/backends/arm/math/conv3x3s1_depthwise_fp32.cc delete mode 100644 lite/backends/arm/math/conv3x3s2_depthwise_fp32.cc delete mode 100644 lite/backends/arm/math/conv_depthwise_3x3p0.cc delete mode 100644 lite/backends/arm/math/conv_depthwise_3x3p1.cc delete mode 100644 lite/backends/arm/math/conv_depthwise_3x3s1.cc delete mode 100644 lite/backends/arm/math/conv_depthwise_3x3s2.cc mode change 100755 => 100644 lite/backends/arm/math/reduce_prod.cc mode change 100755 => 100644 lite/backends/arm/math/reduce_prod.h mode change 100755 => 100644 lite/backends/arm/math/split_merge_lod_tenosr.cc mode change 100755 => 100644 lite/backends/arm/math/split_merge_lod_tenosr.h mode change 100755 => 100644 lite/backends/fpga/KD/debugger.hpp mode change 100644 => 100755 lite/backends/fpga/KD/dl_engine.cpp mode change 100644 => 100755 lite/backends/fpga/KD/dl_engine.hpp mode change 100644 => 100755 lite/backends/fpga/KD/llapi/zynqmp_api.cpp mode change 100644 => 100755 lite/backends/fpga/KD/llapi/zynqmp_api.h mode change 100644 => 100755 lite/backends/fpga/KD/pes/conv_process.hpp mode change 100644 => 100755 lite/backends/fpga/KD/pes/crop_pe.cpp mode change 100644 => 100755 
lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp mode change 100755 => 100644 lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp mode change 100644 => 100755 lite/backends/fpga/KD/pes/fully_connected_pe.hpp mode change 100755 => 100644 lite/backends/fpga/KD/pes/gru_pe.hpp mode change 100755 => 100644 lite/backends/fpga/KD/pes/gru_util.hpp mode change 100644 => 100755 lite/backends/fpga/KD/pes/output_pe.hpp mode change 100644 => 100755 lite/backends/fpga/KD/pes/pooling_pe.hpp mode change 100755 => 100644 lite/backends/fpga/KD/pes/scale_pe.hpp mode change 100644 => 100755 lite/backends/fpga/lite_tensor.cc delete mode 100644 lite/backends/npu/builder.cc delete mode 100644 lite/backends/npu/builder.h mode change 100755 => 100644 lite/backends/npu/device.cc mode change 100755 => 100644 lite/backends/npu/device.h delete mode 100644 lite/backends/npu/runtime.cc delete mode 100644 lite/backends/npu/runtime.h mode change 100755 => 100644 lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl mode change 100755 => 100644 lite/backends/opencl/cl_kernel/image/reshape_kernel.cl mode change 100755 => 100644 lite/backends/x86/parallel.h delete mode 100644 lite/backends/xpu/builder.cc delete mode 100644 lite/backends/xpu/builder.h mode change 100755 => 100644 lite/backends/xpu/device.cc mode change 100755 => 100644 lite/backends/xpu/device.h delete mode 100644 lite/backends/xpu/runtime.cc delete mode 100644 lite/backends/xpu/runtime.h mode change 100755 => 100644 lite/core/mir/elimination/elementwise_mul_constant_eliminate_pass.cc mode change 100755 => 100644 lite/core/mir/fusion/sequence_pool_concat_fuse_pass.cc mode change 100755 => 100644 lite/core/mir/fusion/sequence_pool_concat_fuse_pass.h mode change 100755 => 100644 lite/core/mir/fusion/sequence_pool_concat_fuser.cc mode change 100755 => 100644 lite/core/mir/fusion/sequence_pool_concat_fuser.h mode change 100755 => 100644 lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.cc mode change 100755 => 100644 lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.h mode change 100755 => 100644 lite/core/mir/fusion/var_conv_2d_activation_fuser.cc mode change 100755 => 100644 lite/core/mir/fusion/var_conv_2d_activation_fuser.h mode change 100755 => 100644 lite/core/mir/subgraph/subgraph_detector.cc mode change 100755 => 100644 lite/core/mir/subgraph/subgraph_detector.h mode change 100755 => 100644 lite/core/mir/subgraph/subgraph_detector_test.cc mode change 100755 => 100644 lite/core/mir/subgraph/subgraph_pass.cc mode change 100755 => 100644 lite/core/mir/subgraph/subgraph_pass.h mode change 100755 => 100644 lite/core/mir/subgraph/subgraph_pass_test.cc mode change 100755 => 100644 lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv7 mode change 100755 => 100644 lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv8 rename lite/demo/cxx/makefiles/{mobile_detection => ssd_detection}/Makefile.android.armv7 (90%) rename lite/demo/cxx/makefiles/{mobile_detection => ssd_detection}/Makefile.android.armv8 (89%) create mode 100644 lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 create mode 100644 lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 create mode 100644 lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv7 create mode 100644 lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv8 mode change 100755 => 100644 lite/demo/cxx/mobile_classify/mobile_classify.cc delete mode 100644 lite/demo/cxx/mobile_detection/test.jpg rename lite/demo/cxx/{mobile_detection/mobile_detection.cc => 
ssd_detection/ssd_detection.cc} (98%) create mode 100644 lite/demo/cxx/test_cv/README.md create mode 100644 lite/demo/cxx/test_cv/test_img_prepross.cc create mode 100644 lite/demo/cxx/test_cv/test_model_cv.cc create mode 100644 lite/demo/cxx/yolov3_detection/yolov3_detection.cc mode change 100755 => 100644 lite/kernels/arm/collect_fpn_proposals_compute.cc mode change 100755 => 100644 lite/kernels/arm/collect_fpn_proposals_compute.h mode change 100755 => 100644 lite/kernels/arm/conditional_block_compute.cc mode change 100755 => 100644 lite/kernels/arm/conditional_block_compute.h delete mode 100644 lite/kernels/arm/conv_transpose_compute_test.cc mode change 100755 => 100644 lite/kernels/arm/distribute_fpn_proposals_compute.cc mode change 100755 => 100644 lite/kernels/arm/distribute_fpn_proposals_compute.h mode change 100755 => 100644 lite/kernels/arm/grid_sampler_compute.cc mode change 100755 => 100644 lite/kernels/arm/grid_sampler_compute.h mode change 100755 => 100644 lite/kernels/arm/instance_norm_compute.cc mode change 100755 => 100644 lite/kernels/arm/instance_norm_compute.h mode change 100755 => 100644 lite/kernels/arm/merge_lod_tensor_compute.cc mode change 100755 => 100644 lite/kernels/arm/merge_lod_tensor_compute.h mode change 100755 => 100644 lite/kernels/arm/merge_lod_tensor_compute_test.cc mode change 100755 => 100644 lite/kernels/arm/reduce_prod_compute.cc mode change 100755 => 100644 lite/kernels/arm/reduce_prod_compute.h mode change 100755 => 100644 lite/kernels/arm/split_lod_tensor_compute.cc mode change 100755 => 100644 lite/kernels/arm/split_lod_tensor_compute.h mode change 100755 => 100644 lite/kernels/arm/split_lod_tensor_compute_test.cc delete mode 100644 lite/kernels/cuda/elementwise_add_compute.cu delete mode 100644 lite/kernels/cuda/elementwise_add_compute.h delete mode 100644 lite/kernels/cuda/elementwise_add_compute_test.cc mode change 100755 => 100644 lite/kernels/cuda/sequence_pool_concat_compute.cu mode change 100755 => 100644 lite/kernels/cuda/sequence_pool_concat_compute.h mode change 100644 => 100755 lite/kernels/fpga/CMakeLists.txt mode change 100644 => 100755 lite/kernels/fpga/calib_compute.cc mode change 100644 => 100755 lite/kernels/fpga/conv_compute.cc mode change 100644 => 100755 lite/kernels/fpga/conv_compute.h mode change 100755 => 100644 lite/kernels/fpga/dropout_compute.cc mode change 100644 => 100755 lite/kernels/fpga/elementwise_compute.cc mode change 100644 => 100755 lite/kernels/fpga/fc_compute.h mode change 100644 => 100755 lite/kernels/fpga/feed_compute.cc mode change 100644 => 100755 lite/kernels/fpga/feed_compute.h mode change 100644 => 100755 lite/kernels/fpga/fetch_compute.h mode change 100755 => 100644 lite/kernels/fpga/gru_compute.h mode change 100755 => 100644 lite/kernels/fpga/im2sequence_compute.cc mode change 100755 => 100644 lite/kernels/fpga/im2sequence_compute.h mode change 100755 => 100644 lite/kernels/fpga/mul_compute.h mode change 100755 => 100644 lite/kernels/fpga/multiclass_nms_compute.cc mode change 100755 => 100644 lite/kernels/fpga/norm_compute.cc mode change 100755 => 100644 lite/kernels/fpga/norm_compute.h mode change 100644 => 100755 lite/kernels/fpga/pooling_compute_test.cc mode change 100755 => 100644 lite/kernels/fpga/prior_box_compute.cc mode change 100755 => 100644 lite/kernels/fpga/prior_box_compute.h mode change 100755 => 100644 lite/kernels/fpga/reshape_compute.cc mode change 100644 => 100755 lite/kernels/fpga/scale_compute.cc mode change 100644 => 100755 lite/kernels/fpga/scale_compute.h mode change 100644 => 
100755 lite/kernels/fpga/softmax_compute.cc mode change 100755 => 100644 lite/kernels/fpga/transpose_compute.cc mode change 100755 => 100644 lite/kernels/npu/bridges/argmax_op.cc mode change 100755 => 100644 lite/kernels/npu/bridges/argmax_op_test.cc delete mode 100644 lite/kernels/npu/bridges/batch_norm_op_test.cc mode change 100755 => 100644 lite/kernels/npu/bridges/engine.cc mode change 100755 => 100644 lite/kernels/npu/bridges/engine.h mode change 100755 => 100644 lite/kernels/npu/bridges/graph.cc mode change 100755 => 100644 lite/kernels/npu/bridges/graph.h mode change 100755 => 100644 lite/kernels/npu/bridges/paddle_use_bridges.h delete mode 100644 lite/kernels/npu/bridges/paddle_use_npu_bridges.h delete mode 100644 lite/kernels/npu/bridges/pool_op_test.cc delete mode 100644 lite/kernels/npu/bridges/shuffle_channel_op_test.cc delete mode 100644 lite/kernels/npu/bridges/transpose_op_test.cc mode change 100755 => 100644 lite/kernels/npu/bridges/unsqueeze_op.cc delete mode 100755 lite/kernels/npu/bridges/unsqueeze_op_test.cc mode change 100755 => 100644 lite/kernels/npu/bridges/utility.cc mode change 100755 => 100644 lite/kernels/npu/bridges/utility.h delete mode 100644 lite/kernels/npu/graph_compute.cc delete mode 100644 lite/kernels/npu/graph_compute.h mode change 100755 => 100644 lite/kernels/npu/subgraph_compute.cc mode change 100755 => 100644 lite/kernels/npu/subgraph_compute.h mode change 100755 => 100644 lite/kernels/opencl/conv2d_1x1_compute.cc mode change 100755 => 100644 lite/kernels/opencl/conv2d_1x1_compute_test.cc mode change 100755 => 100644 lite/kernels/opencl/reshape_compute.cc mode change 100755 => 100644 lite/kernels/opencl/reshape_compute_test.cc delete mode 100644 lite/kernels/x86/fc_compute_test.cc delete mode 100644 lite/kernels/x86/relu_compute.cc delete mode 100644 lite/kernels/x86/relu_compute.h delete mode 100644 lite/kernels/xpu/bridges/act_op_test.cc delete mode 100644 lite/kernels/xpu/bridges/batch_norm_op_test.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/dropout_op.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/gather_op.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/graph.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/graph.h mode change 100755 => 100644 lite/kernels/xpu/bridges/layer_norm_op.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/lookup_table_op.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/matmul_op.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/paddle_use_bridges.h delete mode 100644 lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h delete mode 100644 lite/kernels/xpu/bridges/registry.cc delete mode 100644 lite/kernels/xpu/bridges/registry.h mode change 100755 => 100644 lite/kernels/xpu/bridges/reshape_op.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/scale_op.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/slice_op.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/stack_op.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/transpose_op.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/utility.cc mode change 100755 => 100644 lite/kernels/xpu/bridges/utility.h delete mode 100644 lite/kernels/xpu/graph_compute.cc delete mode 100644 lite/kernels/xpu/graph_compute.h mode change 100755 => 100644 lite/kernels/xpu/subgraph_compute.cc mode change 100755 => 100644 lite/kernels/xpu/subgraph_compute.h mode change 100755 => 100644 lite/operators/collect_fpn_proposals_op.cc mode change 100755 => 100644 lite/operators/collect_fpn_proposals_op.h mode 
change 100755 => 100644 lite/operators/conditional_block_op.cc mode change 100755 => 100644 lite/operators/conditional_block_op.h mode change 100755 => 100644 lite/operators/distribute_fpn_proposals_op.cc mode change 100755 => 100644 lite/operators/distribute_fpn_proposals_op.h delete mode 100644 lite/operators/graph_op.cc delete mode 100644 lite/operators/graph_op.h mode change 100755 => 100644 lite/operators/grid_sampler_op.cc mode change 100755 => 100644 lite/operators/grid_sampler_op.h mode change 100755 => 100644 lite/operators/instance_norm_op.cc mode change 100755 => 100644 lite/operators/instance_norm_op.h mode change 100755 => 100644 lite/operators/merge_lod_tensor_op.cc mode change 100755 => 100644 lite/operators/merge_lod_tensor_op.h mode change 100755 => 100644 lite/operators/reduce_prod_op.cc mode change 100755 => 100644 lite/operators/reduce_prod_op.h mode change 100755 => 100644 lite/operators/sequence_pool_concat_op.cc mode change 100755 => 100644 lite/operators/sequence_pool_concat_op.h mode change 100755 => 100644 lite/operators/split_lod_tensor_op.cc mode change 100755 => 100644 lite/operators/split_lod_tensor_op.h mode change 100755 => 100644 lite/operators/subgraph_op.cc mode change 100755 => 100644 lite/operators/subgraph_op.h create mode 100644 lite/tests/kernels/batch_norm_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/dropout_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/gather_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/grid_sampler_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/instance_norm_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/layer_norm_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/lookup_table_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/mul_compute_test.cc create mode 100644 lite/tests/kernels/pool_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/reduce_prod_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/reshape_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/softmax_compute_test.cc mode change 100755 => 100644 lite/tests/kernels/transpose_compute_test.cc delete mode 100644 lite/tests/utils/timer.h delete mode 100755 lite/tools/build_bm.sh create mode 100644 lite/tools/cmake_tools/record_supported_kernel_op.py mode change 100755 => 100644 lite/utils/env.h mode change 100755 => 100644 mobile/src/fpga/V2/image.cpp mode change 100755 => 100644 mobile/src/fpga/V2/pe.cpp mode change 100755 => 100644 mobile/src/operators/expand_op.cpp mode change 100755 => 100644 mobile/src/operators/expand_op.h mode change 100755 => 100644 mobile/src/operators/grid_sampler_op.cpp mode change 100755 => 100644 mobile/src/operators/grid_sampler_op.h mode change 100755 => 100644 mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl mode change 100755 => 100644 mobile/src/operators/kernel/cl/cl_kernel/elementwise_sub_kernel.cl mode change 100755 => 100644 mobile/src/operators/kernel/cl/cl_kernel/expend.cl mode change 100755 => 100644 mobile/src/operators/kernel/cl/cl_kernel/grid_sampler_kernel.cl mode change 100755 => 100644 mobile/src/operators/kernel/cl/elementwise_sub_kernel.cpp mode change 100755 => 100644 mobile/src/operators/kernel/cl/expand_kernel.cpp mode change 100755 => 100644 mobile/src/operators/kernel/cl/grid_sampler_kernel.cpp mode change 100755 => 100644 mobile/src/operators/kernel/expand_kernel.h mode change 100755 => 100644 
mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp mode change 100755 => 100644 mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp mode change 100755 => 100644 mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp mode change 100755 => 100644 mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp mode change 100755 => 100644 mobile/src/operators/kernel/grid_sampler_kernel.h mode change 100755 => 100644 mobile/test/executor_for_test_opencl.h mode change 100755 => 100644 mobile/test/net/test_inference_api_v2.cpp create mode 100644 mobile/test/net/test_mobilenet_male2fe.cpp mode change 100755 => 100644 mobile/test/net/test_net_multi_feed.cpp mode change 100755 => 100644 mobile/test/operators/test_expend_op.cpp mode change 100755 => 100644 mobile/tools/python/fluidtools/run_multi_feed.py diff --git a/README.md b/README.md index 83d0a986da..22b8488829 100644 --- a/README.md +++ b/README.md @@ -1 +1,74 @@ -编译方法: ./lite/tools/build_bm.sh --target_name=bm --bm_sdk_root=/Paddle-Lite/third-party/bmnnsdk2-bm1684_v2.0.1 bm +[中文版](./README_cn.md) + +# Paddle Lite + + +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](https://paddlepaddle.github.io/Paddle-Lite/) +[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) + + + +Paddle Lite is an updated version of Paddle-Mobile, an open-open source deep learning framework designed to make it easy to perform inference on mobile, embeded, and IoT devices. It is compatible with PaddlePaddle and pre-trained models from other sources. + +For tutorials, please see [PaddleLite Document](https://paddlepaddle.github.io/Paddle-Lite/). + +## Key Features + +### Light Weight + +On mobile devices, execution module can be deployed without third-party libraries, because our excecution module and analysis module are decoupled. + +On ARM V7, only 800KB are taken up, while on ARM V8, 1.3MB are taken up with the 80 operators and 85 kernels in the dynamic libraries provided by Paddle Lite. + +Paddle Lite enables immediate inference without extra optimization. + +### High Performance + +Paddle Lite enables device-optimized kernels, maximizing ARM CPU performance. + +It also supports INT8 quantizations with [PaddleSlim model compression tools](https://github.com/PaddlePaddle/models/tree/v1.5/PaddleSlim), reducing the size of models and increasing the performance of models. + +On Huawei NPU and FPGA, the performance is also boosted. + +The latest benchmark is located at [benchmark](https://paddlepaddle.github.io/Paddle-Lite/develop/benchmark/) + +### High Compatibility + +Hardware compatibility: Paddle Lite supports a diversity of hardwares — ARM CPU, Mali GPU, Adreno GPU, Huawei NPU and FPGA. In the near future, we will also support AI microchips from Cambricon and Bitmain. + +Model compatibility: The Op of Paddle Lite is fully compatible to that of PaddlePaddle. The accuracy and performance of 18 models (mostly CV models and OCR models) and 85 operators have been validated. In the future, we will also support other models. + +Framework compatibility: In addition to models trained on PaddlePaddle, those trained on Caffe and TensorFlow can also be converted to be used on Paddle Lite, via [X2Paddle](https://github.com/PaddlePaddle/X2Paddle). In the future to come, we will also support models of ONNX format. 
+ +## Architecture + +Paddle Lite is designed to support a wide range of hardwares and devices, and it enables mixed execution of a single model on multiple devices, optimization on various phases, and leight-weighted applications on devices. + +![img](https://user-images.githubusercontent.com/45189361/70908123-6ce4fd00-2045-11ea-97e1-ad08446c5c86.png) + +As is shown in the figure above, analysis phase includes Machine IR module, and it enables optimizations like Op fusion and redundant computation pruning. Besides, excecution phase only involves Kernal exevution, so it can be deployed on its own to ensure maximized light-weighted deployment. + +## Key Info about the Update + +The earlier Paddle-Mobile was designed to be compatible with PaddlePaddle and multiple hardwares, including ARM CPU, Mali GPU, Adreno GPU, FPGA, ARM-Linux and Apple's GPU Metal. Within Baidu, inc, many product lines have been using Paddle-Mobile. For more details, please see: [mobile/README](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/mobile/README.md). + +As an update of Paddle-Mobile, Paddle Lite has incorporated many older capabilities into the [new architecture](https://github.com/PaddlePaddle/Paddle-Lite/tree/develop/lite). For the time being, the code of Paddle-mobile will be kept under the directory `mobile/`, before complete transfer to Paddle Lite. + +For demands of Apple's GPU Metal and web front end inference, please see `./metal` and `./web` . These two modules will be further developed and maintained. + +## Special Thanks + +Paddle Lite has referenced the following open-source projects: + +- [ARM compute library](http://agroup.baidu.com/paddle-infer/md/article/%28https://github.com/ARM-software/ComputeLibrary%29) +- [Anakin](https://github.com/PaddlePaddle/Anakin). The optimizations under Anakin has been incorporated into Paddle Lite, and so there will not be any future updates of Anakin. As another high-performance inference project under PaddlePaddle, Anakin has been forward-looking and helpful to the making of Paddle Lite. + + +## Feedback and Community Support + +- Questions, reports, and suggestions are welcome through Github Issues! +- Forum: Opinions and questions are welcome at our [PaddlePaddle Forum](https://ai.baidu.com/forum/topic/list/168)! +- WeChat Official Account: PaddlePaddle +- QQ Group Chat: 696965088 +

     

+

  WeChat Official Account           QQ Group Chat     

diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake index 76f62765af..0597ef0cc4 100644 --- a/cmake/cross_compiling/ios.cmake +++ b/cmake/cross_compiling/ios.cmake @@ -120,6 +120,7 @@ # ## Lite settings +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flto") if (ARM_TARGET_OS STREQUAL "ios") set(PLATFORM "OS") elseif(ARM_TARGET_OS STREQUAL "ios64") diff --git a/cmake/cross_compiling/npu.cmake b/cmake/cross_compiling/npu.cmake index 25aa4d2bc8..c22bb1db4f 100644 --- a/cmake/cross_compiling/npu.cmake +++ b/cmake/cross_compiling/npu.cmake @@ -30,7 +30,7 @@ if(NOT NPU_DDK_INC) message(FATAL_ERROR "Can not find HiAiModelManagerService.h in ${NPU_DDK_ROOT}/include") endif() -include_directories("${NPU_DDK_ROOT}") +include_directories("${NPU_DDK_ROOT}/include") set(NPU_SUB_LIB_PATH "lib64") if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index c053d4ec2b..cb6a872e06 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -224,10 +224,14 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_full/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_full/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile" - COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/ssd_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/ssd_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/ssd_detection/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/yolov3_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/yolov3_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/yolov3_detection/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile" ) add_dependencies(publish_inference_android_cxx_demos logging gflags) add_dependencies(publish_inference_cxx_lib publish_inference_android_cxx_demos) @@ -239,10 +243,14 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/README.md" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_light" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_light/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" 
"${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_light/Makefile" - COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_detection/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/ssd_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/ssd_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/ssd_detection/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/yolov3_detection" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/yolov3_detection/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/yolov3_detection/Makefile" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/mobile_classify" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/mobile_classify/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/mobile_classify/Makefile" + COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/test_cv" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/demo/cxx/makefiles/test_cv/Makefile.${ARM_TARGET_OS}.${ARM_TARGET_ARCH_ABI}" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx/test_cv/Makefile" ) add_dependencies(tiny_publish_cxx_lib publish_inference_android_cxx_demos) endif() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index d57496487a..a1fde4c152 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -35,6 +35,7 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGE NPU_DEPS ${npu_kernels}) target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels}) + if (LITE_WITH_NPU) # Strips the symbols of our protobuf functions to fix the conflicts during # loading HIAI builder libs (libhiai_ir.so and libhiai_ir_build.so) @@ -45,8 +46,8 @@ else() if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux")) add_library(paddle_light_api_shared SHARED "") target_sources(paddle_light_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc light_api_impl.cc) - set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") - add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) + set_target_properties(paddle_light_api_shared PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") + add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) if (LITE_WITH_NPU) # Need to add HIAI runtime libs (libhiai.so) dependency target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs}) @@ -91,6 +92,7 @@ if (NOT LITE_ON_TINY_PUBLISH) SRCS cxx_api.cc DEPS ${cxx_api_deps} ${ops} ${host_kernels} program X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels} ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} @@ -129,7 +131,9 @@ if(WITH_TESTING) DEPS cxx_api mir_passes lite_api_test_helper ${ops} ${host_kernels} X86_DEPS ${x86_kernels} + CUDA_DEPS ${cuda_kernels} ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} @@ -293,12 +297,13 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL) message(STATUS "Compiling model_optimize_tool") lite_cc_binary(model_optimize_tool SRCS 
model_optimize_tool.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc DEPS gflags kernel op optimizer mir_passes utils) - add_dependencies(model_optimize_tool op_list_h kernel_list_h all_kernel_faked_cc) + add_dependencies(model_optimize_tool op_list_h kernel_list_h all_kernel_faked_cc supported_kernel_op_info_h) endif(LITE_ON_MODEL_OPTIMIZE_TOOL) lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle_api_light ${ops} ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} @@ -327,13 +332,14 @@ if(NOT IOS) lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) - lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils + lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index 990d08f18f..c1e9fc4224 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -201,7 +201,11 @@ void Predictor::Build(const lite_api::CxxConfig &config, const std::string &model_file = config.model_file(); const std::string ¶m_file = config.param_file(); const bool model_from_memory = config.model_from_memory(); - LOG(INFO) << "load from memory " << model_from_memory; + if (model_from_memory) { + LOG(INFO) << "Load model from memory."; + } else { + LOG(INFO) << "Load model from file."; + } Build(model_path, model_file, diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index 3e6e10103e..81ea60eac6 100644 --- a/lite/api/cxx_api_impl.cc +++ b/lite/api/cxx_api_impl.cc @@ -42,11 +42,11 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) - int num_threads = config.cpu_math_library_num_threads(); + int num_threads = config.x86_math_library_num_threads(); int real_num_threads = num_threads > 1 ? 
num_threads : 1; paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads); omp_set_num_threads(real_num_threads); - VLOG(3) << "set_cpu_math_library_math_threads() is set successfully and the " + VLOG(3) << "set_x86_math_library_math_threads() is set successfully and the " "number of threads is:" << num_threads; #endif diff --git a/lite/api/lite_multithread_test.cc b/lite/api/lite_multithread_test.cc old mode 100755 new mode 100644 diff --git a/lite/api/model_optimize_tool.cc b/lite/api/model_optimize_tool.cc index b678c7ecd2..fc23e0b54b 100644 --- a/lite/api/model_optimize_tool.cc +++ b/lite/api/model_optimize_tool.cc @@ -16,8 +16,9 @@ #ifdef PADDLE_WITH_TESTING #include #endif -// "all_kernel_faked.cc" and "kernel_src_map.h" are created automatically during -// model_optimize_tool's compiling period +// "supported_kernel_op_info.h", "all_kernel_faked.cc" and "kernel_src_map.h" +// are created automatically during model_optimize_tool's compiling period +#include #include "all_kernel_faked.cc" // NOLINT #include "kernel_src_map.h" // NOLINT #include "lite/api/cxx_api.h" @@ -25,8 +26,11 @@ #include "lite/api/paddle_use_ops.h" #include "lite/api/paddle_use_passes.h" #include "lite/core/op_registry.h" +#include "lite/model_parser/compatible_pb.h" +#include "lite/model_parser/pb/program_desc.h" #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" +#include "supported_kernel_op_info.h" // NOLINT DEFINE_string(model_dir, "", @@ -62,10 +66,16 @@ DEFINE_string(valid_targets, "The targets this model optimized for, should be one of (arm, " "opencl, x86), splitted by space"); DEFINE_bool(prefer_int8_kernel, false, "Prefer to run model with int8 kernels"); +DEFINE_bool(print_supported_ops, + false, + "Print supported operators on the inputed target"); +DEFINE_bool(print_all_ops, + false, + "Print all the valid operators of Paddle-Lite"); +DEFINE_bool(print_model_ops, false, "Print operators in the input model"); namespace paddle { namespace lite_api { - //! Display the kernel information. void DisplayKernels() { LOG(INFO) << ::paddle::lite::KernelRegistry::Global().DebugString(); @@ -130,9 +140,7 @@ void RunOptimize(const std::string& model_dir, config.set_model_dir(model_dir); config.set_model_file(model_file); config.set_param_file(param_file); - config.set_valid_places(valid_places); - auto predictor = lite_api::CreatePaddlePredictor(config); LiteModelType model_type; @@ -168,6 +176,202 @@ void CollectModelMetaInfo(const std::string& output_dir, lite::WriteLines(std::vector(total.begin(), total.end()), output_path); } +void PrintOpsInfo(std::set valid_ops = {}) { + std::vector targets = {"kHost", + "kX86", + "kCUDA", + "kARM", + "kOpenCL", + "kFPGA", + "kNPU", + "kXPU", + "kAny", + "kUnk"}; + int maximum_optype_length = 0; + for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) { + maximum_optype_length = it->first.size() > maximum_optype_length + ? 
it->first.size() + : maximum_optype_length; + } + std::cout << std::setiosflags(std::ios::internal); + std::cout << std::setw(maximum_optype_length) << "OP_name"; + for (int i = 0; i < targets.size(); i++) { + std::cout << std::setw(10) << targets[i].substr(1); + } + std::cout << std::endl; + if (valid_ops.empty()) { + for (auto it = supported_ops.begin(); it != supported_ops.end(); it++) { + std::cout << std::setw(maximum_optype_length) << it->first; + auto ops_valid_places = it->second; + for (int i = 0; i < targets.size(); i++) { + if (std::find(ops_valid_places.begin(), + ops_valid_places.end(), + targets[i]) != ops_valid_places.end()) { + std::cout << std::setw(10) << "Y"; + } else { + std::cout << std::setw(10) << " "; + } + } + std::cout << std::endl; + } + } else { + for (auto op = valid_ops.begin(); op != valid_ops.end(); op++) { + std::cout << std::setw(maximum_optype_length) << *op; + // Check: If this kernel doesn't match any operator, we will skip it. + if (supported_ops.find(*op) == supported_ops.end()) { + continue; + } + // Print OP info. + auto ops_valid_places = supported_ops.at(*op); + for (int i = 0; i < targets.size(); i++) { + if (std::find(ops_valid_places.begin(), + ops_valid_places.end(), + targets[i]) != ops_valid_places.end()) { + std::cout << std::setw(10) << "Y"; + } else { + std::cout << std::setw(10) << " "; + } + } + std::cout << std::endl; + } + } +} +/// Print help information +void PrintHelpInfo() { + // at least one argument should be inputed + const char help_info[] = + "At least one argument should be inputed. Valid arguments are listed " + "below:\n" + " Arguments of model optimization:\n" + " `--model_dir=`\n" + " `--model_file=`\n" + " `--param_file=`\n" + " `--optimize_out_type=(protobuf|naive_buffer)`\n" + " `--optimize_out=`\n" + " `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" + " `--prefer_int8_kernel=(true|false)`\n" + " `--record_tailoring_info=(true|false)`\n" + " Arguments of model checking and ops information:\n" + " `--print_all_ops=true` Display all the valid operators of " + "Paddle-Lite\n" + " `--print_supported_ops=true " + "--valid_targets=(arm|opencl|x86|npu|xpu)`" + " Display valid operators of input targets\n" + " `--print_model_ops=true --model_dir= " + "--valid_targets=(arm|opencl|x86|npu|xpu)`" + " Display operators in the input model\n"; + std::cout << help_info << std::endl; + exit(1); +} + +// Parse Input command +void ParseInputCommand() { + if (FLAGS_print_all_ops) { + std::cout << "All OPs supported by Paddle-Lite: " << supported_ops.size() + << " ops in total." << std::endl; + PrintOpsInfo(); + exit(1); + } else if (FLAGS_print_supported_ops) { + auto valid_places = paddle::lite_api::ParserValidPlaces(); + // get valid_targets string + std::vector target_types = {}; + for (int i = 0; i < valid_places.size(); i++) { + target_types.push_back(valid_places[i].target); + } + std::string targets_str = TargetToStr(target_types[0]); + for (int i = 1; i < target_types.size(); i++) { + targets_str = targets_str + TargetToStr(target_types[i]); + } + + std::cout << "Supported OPs on '" << targets_str << "': " << std::endl; + target_types.push_back(TARGET(kHost)); + target_types.push_back(TARGET(kUnk)); + + std::set valid_ops; + for (int i = 0; i < target_types.size(); i++) { + auto ops = supported_ops_target[static_cast(target_types[i])]; + valid_ops.insert(ops.begin(), ops.end()); + } + PrintOpsInfo(valid_ops); + exit(1); + } +} +// test whether this model is supported +void CheckIfModelSupported() { + // 1. 
parse valid places and valid targets + auto valid_places = paddle::lite_api::ParserValidPlaces(); + // set valid_ops + auto valid_ops = supported_ops_target[static_cast(TARGET(kHost))]; + auto valid_unktype_ops = supported_ops_target[static_cast(TARGET(kUnk))]; + valid_ops.insert( + valid_ops.end(), valid_unktype_ops.begin(), valid_unktype_ops.end()); + for (int i = 0; i < valid_places.size(); i++) { + auto target = valid_places[i].target; + auto ops = supported_ops_target[static_cast(target)]; + valid_ops.insert(valid_ops.end(), ops.begin(), ops.end()); + } + // get valid ops + std::set valid_ops_set(valid_ops.begin(), valid_ops.end()); + + // 2.Load model into program to get ops in model + std::string prog_path = FLAGS_model_dir + "/__model__"; + if (!FLAGS_model_file.empty() && !FLAGS_param_file.empty()) { + prog_path = FLAGS_model_file; + } + lite::cpp::ProgramDesc cpp_prog; + framework::proto::ProgramDesc pb_proto_prog = + *lite::LoadProgram(prog_path, false); + lite::pb::ProgramDesc pb_prog(&pb_proto_prog); + // Transform to cpp::ProgramDesc + lite::TransformProgramDescAnyToCpp(pb_prog, &cpp_prog); + + std::set unsupported_ops; + std::set input_model_ops; + for (int index = 0; index < cpp_prog.BlocksSize(); index++) { + auto current_block = cpp_prog.GetBlock(index); + for (size_t i = 0; i < current_block->OpsSize(); ++i) { + auto& op_desc = *current_block->GetOp(i); + auto op_type = op_desc.Type(); + input_model_ops.insert(op_type); + if (valid_ops_set.count(op_type) == 0) { + unsupported_ops.insert(op_type); + } + } + } + // 3. Print ops_info of input model and check if this model is supported + if (FLAGS_print_model_ops) { + std::cout << "OPs in the input model include:\n"; + PrintOpsInfo(input_model_ops); + } + if (!unsupported_ops.empty()) { + std::string unsupported_ops_str = *unsupported_ops.begin(); + for (auto op_str = ++unsupported_ops.begin(); + op_str != unsupported_ops.end(); + op_str++) { + unsupported_ops_str = unsupported_ops_str + ", " + *op_str; + } + std::vector targets = {}; + for (int i = 0; i < valid_places.size(); i++) { + targets.push_back(valid_places[i].target); + } + std::sort(targets.begin(), targets.end()); + targets.erase(unique(targets.begin(), targets.end()), targets.end()); + std::string targets_str = TargetToStr(targets[0]); + for (int i = 1; i < targets.size(); i++) { + targets_str = targets_str + "," + TargetToStr(targets[i]); + } + + LOG(ERROR) << "Error: This model is not supported, because " + << unsupported_ops.size() << " ops are not supported on '" + << targets_str << "'. These unsupported ops are: '" + << unsupported_ops_str << "'."; + exit(1); + } + if (FLAGS_print_model_ops) { + std::cout << "Paddle-Lite supports this model!" << std::endl; + exit(1); + } +} void Main() { if (FLAGS_display_kernels) { @@ -241,7 +445,13 @@ void Main() { } // namespace paddle int main(int argc, char** argv) { + // If there is none input argument, print help info. 
+ if (argc < 2) { + paddle::lite_api::PrintHelpInfo(); + } google::ParseCommandLineFlags(&argc, &argv, false); + paddle::lite_api::ParseInputCommand(); + paddle::lite_api::CheckIfModelSupported(); paddle::lite_api::Main(); return 0; } diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc index dc9fac96ee..5b063a8ef1 100644 --- a/lite/api/model_test.cc +++ b/lite/api/model_test.cc @@ -86,6 +86,7 @@ void Run(const std::vector>& input_shapes, for (int i = 0; i < input_shapes[j].size(); ++i) { input_num *= input_shapes[j][i]; } + for (int i = 0; i < input_num; ++i) { input_data[i] = 1.f; } diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index a014719c57..6308699ac9 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -133,7 +133,9 @@ class LITE_API CxxConfig : public ConfigBase { std::string model_file_; std::string param_file_; bool model_from_memory_{false}; - int cpu_math_library_math_threads_ = 1; +#ifdef LITE_WITH_X86 + int x86_math_library_math_threads_ = 1; +#endif public: void set_valid_places(const std::vector& x) { valid_places_ = x; } @@ -153,12 +155,14 @@ class LITE_API CxxConfig : public ConfigBase { std::string param_file() const { return param_file_; } bool model_from_memory() const { return model_from_memory_; } - void set_cpu_math_library_num_threads(int threads) { - cpu_math_library_math_threads_ = threads; +#ifdef LITE_WITH_X86 + void set_x86_math_library_num_threads(int threads) { + x86_math_library_math_threads_ = threads; } - int cpu_math_library_num_threads() const { - return cpu_math_library_math_threads_; + int x86_math_library_num_threads() const { + return x86_math_library_math_threads_; } +#endif }; /// MobileConfig is the config for the light weight predictor, it will skip diff --git a/lite/api/test_step_rnn_lite_x86.cc b/lite/api/test_step_rnn_lite_x86.cc index 075d314df6..013fd82b19 100644 --- a/lite/api/test_step_rnn_lite_x86.cc +++ b/lite/api/test_step_rnn_lite_x86.cc @@ -30,7 +30,9 @@ TEST(Step_rnn, test_step_rnn_lite_x86) { std::string model_dir = FLAGS_model_dir; lite_api::CxxConfig config; config.set_model_dir(model_dir); - config.set_cpu_math_library_num_threads(1); +#ifdef LITE_WITH_X86 + config.set_x86_math_library_num_threads(1); +#endif config.set_valid_places({lite_api::Place{TARGET(kX86), PRECISION(kInt64)}, lite_api::Place{TARGET(kX86), PRECISION(kFloat)}, lite_api::Place{TARGET(kHost), PRECISION(kFloat)}}); diff --git a/lite/backends/arm/math/conv3x3s1_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s1_depthwise_fp32.cc deleted file mode 100644 index 99aeea8bde..0000000000 --- a/lite/backends/arm/math/conv3x3s1_depthwise_fp32.cc +++ /dev/null @@ -1,538 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include "lite/backends/arm/math/conv_block_utils.h" -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/operators/op_params.h" -#ifdef ARM_WITH_OMP -#include -#endif - -namespace paddle { -namespace lite { -namespace arm { -namespace math { -void conv_3x3s1_depthwise_fp32(const float* i_data, - float* o_data, - int bs, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx) { - int threads = ctx->threads(); - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; - const int out_c_block = 4; - const int out_h_kernel = 2; - const int out_w_kernel = 4; - const int win_ext = ow + 2; - const int ow_round = ROUNDUP(ow, 4); - const int win_round = ROUNDUP(win_ext, 4); - const int hin_round = oh + 2; - const int prein_size = win_round * hin_round * out_c_block; - auto workspace_size = - threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/; - ctx->ExtendWorkspace(sizeof(float) * workspace_size); - - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - - /// get workspace - float* ptr_zero = ctx->workspace_data(); - memset(ptr_zero, 0, sizeof(float) * win_round); - float* ptr_write = ptr_zero + win_round; - - int size_in_channel = win * ih; - int size_out_channel = ow * oh; - - int ws = -pad_w; - int we = ws + win_round; - int hs = -pad_h; - int he = hs + hin_round; - int w_loop = ow_round / 4; - auto remain = w_loop * 4 - ow; - bool flag_remain = remain > 0; - remain = 4 - remain; - remain = remain > 0 ? remain : 0; - int row_len = win_round * out_c_block; - - for (int n = 0; n < bs; ++n) { - const float* din_batch = i_data + n * ic * size_in_channel; - float* dout_batch = o_data + n * oc * size_out_channel; -#pragma omp parallel for num_threads(threads) - for (int c = 0; c < oc; c += out_c_block) { -#ifdef ARM_WITH_OMP - float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size; -#else - float* pre_din = ptr_write + ow_round; -#endif - /// const array size - float pre_out[out_c_block * out_w_kernel * out_h_kernel]; // NOLINT - prepack_input_nxwc4_dw( - din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero); - const float* weight_c = weights + c * 9; // kernel_w * kernel_h - float* dout_c00 = dout_batch + c * size_out_channel; - float bias_local[4] = {0, 0, 0, 0}; - if (flag_bias) { - bias_local[0] = bias[c]; - bias_local[1] = bias[c + 1]; - bias_local[2] = bias[c + 2]; - bias_local[3] = bias[c + 3]; - } - float32x4_t vbias = vld1q_f32(bias_local); -#ifdef __aarch64__ - float32x4_t w0 = vld1q_f32(weight_c); // w0, v23 - float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24 - float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25 - float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26 - float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27 - float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28 - float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29 - float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30 - float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31 -#endif - for (int h = 0; h < oh; h += out_h_kernel) { - float* outc00 = dout_c00 + h * ow; - float* outc01 = outc00 + ow; - float* outc10 = outc00 + size_out_channel; - float* outc11 = outc10 + ow; - float* outc20 = outc10 + size_out_channel; - float* outc21 = outc20 + ow; - float* outc30 = outc20 + size_out_channel; - float* outc31 = outc30 + ow; - const float* inr0 = pre_din + h * row_len; - const 
float* inr1 = inr0 + row_len; - const float* inr2 = inr1 + row_len; - const float* inr3 = inr2 + row_len; - if (c + out_c_block > oc) { - switch (c + out_c_block - oc) { - case 3: - outc10 = ptr_write; - outc11 = ptr_write; - case 2: - outc20 = ptr_write; - outc21 = ptr_write; - case 1: - outc30 = ptr_write; - outc31 = ptr_write; - default: - break; - } - } - if (h + out_h_kernel > oh) { - outc01 = ptr_write; - outc11 = ptr_write; - outc21 = ptr_write; - outc31 = ptr_write; - } - float* outl[] = {outc00, - outc10, - outc20, - outc30, - outc01, - outc11, - outc21, - outc31, - reinterpret_cast(bias_local), - reinterpret_cast(flag_relu)}; - void* outl_ptr = reinterpret_cast(outl); - for (int w = 0; w < w_loop; ++w) { - bool flag_mask = (w == w_loop - 1) && flag_remain; - float* out0 = pre_out; -// clang-format off -#ifdef __aarch64__ - asm volatile( - "ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ - "ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/ - "ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ - "ldp q8, q9, [%[inr1]], #32\n" /* load input r1*/ - "ldp q4, q5, [%[inr0]]\n" /* load input r0*/ - "ldp q10, q11, [%[inr1]]\n" /* load input r1*/ - /* r0, r1, mul w0, get out r0, r1 */ - "fmul v15.4s , %[w0].4s, v0.4s\n" /* outr00 = w0 * r0, 0*/ - "fmul v16.4s , %[w0].4s, v1.4s\n" /* outr01 = w0 * r0, 1*/ - "fmul v17.4s , %[w0].4s, v2.4s\n" /* outr02 = w0 * r0, 2*/ - "fmul v18.4s , %[w0].4s, v3.4s\n" /* outr03 = w0 * r0, 3*/ - "fmul v19.4s , %[w0].4s, v6.4s\n" /* outr10 = w0 * r1, 0*/ - "fmul v20.4s , %[w0].4s, v7.4s\n" /* outr11 = w0 * r1, 1*/ - "fmul v21.4s , %[w0].4s, v8.4s\n" /* outr12 = w0 * r1, 2*/ - "fmul v22.4s , %[w0].4s, v9.4s\n" /* outr13 = w0 * r1, 3*/ - /* r0, r1, mul w1, get out r0, r1 */ - "fmla v15.4s , %[w1].4s, v1.4s\n" /* outr00 = w1 * r0[1]*/ - "ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/ - "fmla v16.4s , %[w1].4s, v2.4s\n" /* outr01 = w1 * r0[2]*/ - "fmla v17.4s , %[w1].4s, v3.4s\n" /* outr02 = w1 * r0[3]*/ - "fmla v18.4s , %[w1].4s, v4.4s\n" /* outr03 = w1 * r0[4]*/ - "fmla v19.4s , %[w1].4s, v7.4s\n" /* outr10 = w1 * r1[1]*/ - "fmla v20.4s , %[w1].4s, v8.4s\n" /* outr11 = w1 * r1[2]*/ - "fmla v21.4s , %[w1].4s, v9.4s\n" /* outr12 = w1 * r1[3]*/ - "fmla v22.4s , %[w1].4s, v10.4s\n"/* outr13 = w1 * r1[4]*/ - /* r0, r1, mul w2, get out r0, r1 */ - "fmla v15.4s , %[w2].4s, v2.4s\n" /* outr00 = w2 * r0[2]*/ - "fmla v16.4s , %[w2].4s, v3.4s\n" /* outr01 = w2 * r0[3]*/ - "ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/ - "fmla v17.4s , %[w2].4s, v4.4s\n" /* outr02 = w2 * r0[4]*/ - "fmla v18.4s , %[w2].4s, v5.4s\n" /* outr03 = w2 * r0[5]*/ - "ldp q4, q5, [%[inr2]]\n" /* load input r2*/ - "fmla v19.4s , %[w2].4s, v8.4s\n" /* outr10 = w2 * r1[2]*/ - "fmla v20.4s , %[w2].4s, v9.4s\n" /* outr11 = w2 * r1[3]*/ - "fmla v21.4s , %[w2].4s, v10.4s\n"/* outr12 = w2 * r1[4]*/ - "fmla v22.4s , %[w2].4s, v11.4s\n"/* outr13 = w2 * r1[5]*/ - /* r1, r2, mul w3, get out r0, r1 */ - "fmla v15.4s , %[w3].4s, v6.4s\n" /* outr00 = w3 * r1[0]*/ - "fmla v16.4s , %[w3].4s, v7.4s\n" /* outr01 = w3 * r1[1]*/ - "fmla v17.4s , %[w3].4s, v8.4s\n" /* outr02 = w3 * r1[2]*/ - "fmla v18.4s , %[w3].4s, v9.4s\n" /* outr03 = w3 * r1[3]*/ - "fmla v19.4s , %[w3].4s, v0.4s\n" /* outr10 = w3 * r2[0]*/ - "fmla v20.4s , %[w3].4s, v1.4s\n" /* outr11 = w3 * r2[1]*/ - "fmla v21.4s , %[w3].4s, v2.4s\n" /* outr12 = w3 * r2[2]*/ - "fmla v22.4s , %[w3].4s, v3.4s\n" /* outr13 = w3 * r2[3]*/ - /* r1, r2, mul w4, get out r0, r1 */ - "fmla v15.4s , %[w4].4s, v7.4s\n" /* outr00 = w4 * r1[1]*/ - "ldp q6, q7, [%[inr3]], #32\n" 
/* load input r3*/ - "fmla v16.4s , %[w4].4s, v8.4s\n" /* outr01 = w4 * r1[2]*/ - "fmla v17.4s , %[w4].4s, v9.4s\n" /* outr02 = w4 * r1[3]*/ - "fmla v18.4s , %[w4].4s, v10.4s\n"/* outr03 = w4 * r1[4]*/ - "ldp x0, x1, [%[outl]] \n" - "fmla v19.4s , %[w4].4s, v1.4s\n" /* outr10 = w4 * r2[1]*/ - "fmla v20.4s , %[w4].4s, v2.4s\n" /* outr11 = w4 * r2[2]*/ - "fmla v21.4s , %[w4].4s, v3.4s\n" /* outr12 = w4 * r2[3]*/ - "fmla v22.4s , %[w4].4s, v4.4s\n" /* outr13 = w4 * r2[4]*/ - /* r1, r2, mul w5, get out r0, r1 */ - "fmla v15.4s , %[w5].4s, v8.4s\n" /* outr00 = w5 * r1[2]*/ - "fmla v16.4s , %[w5].4s, v9.4s\n" /* outr01 = w5 * r1[3]*/ - "ldp q8, q9, [%[inr3]], #32\n" /* load input r3*/ - "fmla v17.4s , %[w5].4s, v10.4s\n"/* outr02 = w5 * r1[4]*/ - "fmla v18.4s , %[w5].4s, v11.4s\n"/* outr03 = w5 * r1[5]*/ - "ldp q10, q11, [%[inr3]]\n" /* load input r3*/ - "fmla v19.4s , %[w5].4s, v2.4s\n" /* outr10 = w5 * r2[2]*/ - "fmla v20.4s , %[w5].4s, v3.4s\n" /* outr11 = w5 * r2[3]*/ - "fmla v21.4s , %[w5].4s, v4.4s\n" /* outr12 = w5 * r2[4]*/ - "fmla v22.4s , %[w5].4s, v5.4s\n" /* outr13 = w5 * r2[5]*/ - /* r2, r3, mul w6, get out r0, r1 */ - "fmla v15.4s , %[w6].4s, v0.4s\n" /* outr00 = w6 * r2[0]*/ - "fmla v16.4s , %[w6].4s, v1.4s\n" /* outr01 = w6 * r2[1]*/ - "fmla v17.4s , %[w6].4s, v2.4s\n" /* outr02 = w6 * r2[2]*/ - "fmla v18.4s , %[w6].4s, v3.4s\n" /* outr03 = w6 * r2[3]*/ - "ldp x2, x3, [%[outl], #16] \n" - "fmla v19.4s , %[w6].4s, v6.4s\n" /* outr10 = w6 * r3[0]*/ - "fmla v20.4s , %[w6].4s, v7.4s\n" /* outr11 = w6 * r3[1]*/ - "fmla v21.4s , %[w6].4s, v8.4s\n" /* outr12 = w6 * r3[2]*/ - "fmla v22.4s , %[w6].4s, v9.4s\n" /* outr13 = w6 * r3[3]*/ - /* r2, r3, mul w7, get out r0, r1 */ - "fmla v15.4s , %[w7].4s, v1.4s\n" /* outr00 = w7 * r2[1]*/ - "fmla v16.4s , %[w7].4s, v2.4s\n" /* outr01 = w7 * r2[2]*/ - "fmla v17.4s , %[w7].4s, v3.4s\n" /* outr02 = w7 * r2[3]*/ - "fmla v18.4s , %[w7].4s, v4.4s\n" /* outr03 = w7 * r2[4]*/ - "ldp x4, x5, [%[outl], #32] \n" - "fmla v19.4s , %[w7].4s, v7.4s\n" /* outr10 = w7 * r3[1]*/ - "fmla v20.4s , %[w7].4s, v8.4s\n" /* outr11 = w7 * r3[2]*/ - "fmla v21.4s , %[w7].4s, v9.4s\n" /* outr12 = w7 * r3[3]*/ - "fmla v22.4s , %[w7].4s, v10.4s\n"/* outr13 = w7 * r3[4]*/ - /* r2, r3, mul w8, get out r0, r1 */ - "fmla v15.4s , %[w8].4s, v2.4s\n" /* outr00 = w8 * r2[2]*/ - "fmla v16.4s , %[w8].4s, v3.4s\n" /* outr01 = w8 * r2[3]*/ - "fmla v17.4s , %[w8].4s, v4.4s\n" /* outr02 = w8 * r2[0]*/ - "fmla v18.4s , %[w8].4s, v5.4s\n" /* outr03 = w8 * r2[1]*/ - "ldp x6, x7, [%[outl], #48] \n" - "fmla v19.4s , %[w8].4s, v8.4s\n" /* outr10 = w8 * r3[2]*/ - "fmla v20.4s , %[w8].4s, v9.4s\n" /* outr11 = w8 * r3[3]*/ - "fmla v21.4s , %[w8].4s, v10.4s\n"/* outr12 = w8 * r3[0]*/ - "fmla v22.4s , %[w8].4s, v11.4s\n"/* outr13 = w8 * r3[1]*/ - - "fadd v15.4s, v15.4s, %[vbias].4s\n"/* add bias */ - "fadd v16.4s, v16.4s, %[vbias].4s\n"/* add bias */ - "fadd v17.4s, v17.4s, %[vbias].4s\n"/* add bias */ - "fadd v18.4s, v18.4s, %[vbias].4s\n"/* add bias */ - "fadd v19.4s, v19.4s, %[vbias].4s\n"/* add bias */ - "fadd v20.4s, v20.4s, %[vbias].4s\n"/* add bias */ - "fadd v21.4s, v21.4s, %[vbias].4s\n"/* add bias */ - "fadd v22.4s, v22.4s, %[vbias].4s\n"/* add bias */ - - /* transpose */ - "trn1 v0.4s, v15.4s, v16.4s\n" /* r0: a0a1c0c1*/ - "trn2 v1.4s, v15.4s, v16.4s\n" /* r0: b0b1d0d1*/ - "trn1 v2.4s, v17.4s, v18.4s\n" /* r0: a2a3c2c3*/ - "trn2 v3.4s, v17.4s, v18.4s\n" /* r0: b2b3d2d3*/ - "trn1 v4.4s, v19.4s, v20.4s\n" /* r1: a0a1c0c1*/ - "trn2 v5.4s, v19.4s, v20.4s\n" /* r1: b0b1d0d1*/ - "trn1 v6.4s, 
v21.4s, v22.4s\n" /* r1: a2a3c2c3*/ - "trn2 v7.4s, v21.4s, v22.4s\n" /* r1: b2b3d2d3*/ - "trn1 v15.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ - "trn2 v19.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ - "trn1 v17.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ - "trn2 v21.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/ - "trn1 v16.2d, v4.2d, v6.2d\n" /* r1: a0a1a2a3*/ - "trn2 v20.2d, v4.2d, v6.2d\n" /* r1: c0c1c2c3*/ - "trn1 v18.2d, v5.2d, v7.2d\n" /* r1: b0b1b2b3*/ - "trn2 v22.2d, v5.2d, v7.2d\n" /* r1: d0d1d2d3*/ - - "cbz %w[flag_relu], 0f\n" /* skip relu*/ - "movi v0.4s, #0\n" /* for relu */ - "fmax v15.4s, v15.4s, v0.4s\n" - "fmax v16.4s, v16.4s, v0.4s\n" - "fmax v17.4s, v17.4s, v0.4s\n" - "fmax v18.4s, v18.4s, v0.4s\n" - "fmax v19.4s, v19.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - "0:\n" - "cbnz %w[flag_mask], 1f\n" - "str q15, [x0]\n" /* save outc00 */ - "str q16, [x4]\n" /* save outc01 */ - "str q17, [x1]\n" /* save outc10 */ - "str q18, [x5]\n" /* save outc11 */ - "str q19, [x2]\n" /* save outc20 */ - "str q20, [x6]\n" /* save outc21 */ - "str q21, [x3]\n" /* save outc30 */ - "str q22, [x7]\n" /* save outc31 */ - "b 2f\n" - "1:\n" - "str q15, [%[out]], #16 \n" /* save remain to pre_out */ - "str q17, [%[out]], #16 \n" /* save remain to pre_out */ - "str q19, [%[out]], #16 \n" /* save remain to pre_out */ - "str q21, [%[out]], #16 \n" /* save remain to pre_out */ - "str q16, [%[out]], #16 \n" /* save remain to pre_out */ - "str q18, [%[out]], #16 \n" /* save remain to pre_out */ - "str q20, [%[out]], #16 \n" /* save remain to pre_out */ - "str q22, [%[out]], #16 \n" /* save remain to pre_out */ - "2:\n" - :[inr0] "+r"(inr0), [inr1] "+r"(inr1), - [inr2] "+r"(inr2), [inr3] "+r"(inr3), - [out]"+r"(out0) - :[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2), - [w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5), - [w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8), - [vbias]"w" (vbias), [outl] "r" (outl_ptr), - [flag_mask] "r" (flag_mask), [flag_relu] "r" (flag_relu) - : "cc", "memory", - "v0","v1","v2","v3","v4","v5","v6","v7", - "v8", "v9", "v10", "v11", "v15", - "v16","v17","v18","v19","v20","v21","v22", - "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7" - ); -#else - asm volatile( - /* load weights */ - "vld1.32 {d10-d13}, [%[wc0]]! @ load w0, w1, to q5, q6\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w2, to q7\n" - /* load r0, r1 */ - "vld1.32 {d0-d3}, [%[r0]]! @ load r0, q0, q1\n" - "vld1.32 {d4-d7}, [%[r0]]! @ load r0, q2, q3\n" - /* main loop */ - "0: @ main loop\n" - /* mul r0 with w0, w1, w2, get out r0 */ - "vmul.f32 q8, q5, q0 @ w0 * inr00\n" - "vmul.f32 q9, q5, q1 @ w0 * inr01\n" - "vmul.f32 q10, q5, q2 @ w0 * inr02\n" - "vmul.f32 q11, q5, q3 @ w0 * inr03\n" - "vmla.f32 q8, q6, q1 @ w1 * inr01\n" - "vld1.32 {d0-d3}, [%[r0]] @ load r0, q0, q1\n" - "vmla.f32 q9, q6, q2 @ w1 * inr02\n" - "vmla.f32 q10, q6, q3 @ w1 * inr03\n" - "vmla.f32 q11, q6, q0 @ w1 * inr04\n" - "vmla.f32 q8, q7, q2 @ w2 * inr02\n" - "vmla.f32 q9, q7, q3 @ w2 * inr03\n" - "vld1.32 {d4-d7}, [%[r1]]! @ load r0, q2, q3\n" - "vmla.f32 q10, q7, q0 @ w2 * inr04\n" - "vmla.f32 q11, q7, q1 @ w2 * inr05\n" - "vld1.32 {d0-d3}, [%[r1]]! @ load r0, q0, q1\n" - "vld1.32 {d8-d9}, [%[wc0]]! @ load w3 to q4\n" - /* mul r1 with w0-w5, get out r0, r1 */ - "vmul.f32 q12, q5, q2 @ w0 * inr10\n" - "vmul.f32 q13, q5, q3 @ w0 * inr11\n" - "vmul.f32 q14, q5, q0 @ w0 * inr12\n" - "vmul.f32 q15, q5, q1 @ w0 * inr13\n" - "vld1.32 {d10-d11}, [%[wc0]]! 
@ load w4 to q5\n" - "vmla.f32 q8, q4, q2 @ w3 * inr10\n" - "vmla.f32 q9, q4, q3 @ w3 * inr11\n" - "vmla.f32 q10, q4, q0 @ w3 * inr12\n" - "vmla.f32 q11, q4, q1 @ w3 * inr13\n" - /* mul r1 with w1, w4, get out r1, r0 */ - "vmla.f32 q8, q5, q3 @ w4 * inr11\n" - "vmla.f32 q12, q6, q3 @ w1 * inr11\n" - "vld1.32 {d4-d7}, [%[r1]] @ load r1, q2, q3\n" - "vmla.f32 q9, q5, q0 @ w4 * inr12\n" - "vmla.f32 q13, q6, q0 @ w1 * inr12\n" - "vmla.f32 q10, q5, q1 @ w4 * inr13\n" - "vmla.f32 q14, q6, q1 @ w1 * inr13\n" - "vmla.f32 q11, q5, q2 @ w4 * inr14\n" - "vmla.f32 q15, q6, q2 @ w1 * inr14\n" - "vld1.32 {d12-d13}, [%[wc0]]! @ load w5 to q6\n" - /* mul r1 with w2, w5, get out r1, r0 */ - "vmla.f32 q12, q7, q0 @ w2 * inr12\n" - "vmla.f32 q13, q7, q1 @ w2 * inr13\n" - "vmla.f32 q8, q6, q0 @ w5 * inr12\n" - "vmla.f32 q9, q6, q1 @ w5 * inr13\n" - "vld1.32 {d0-d3}, [%[r2]]! @ load r2, q0, q1\n" - "vmla.f32 q14, q7, q2 @ w2 * inr14\n" - "vmla.f32 q15, q7, q3 @ w2 * inr15\n" - "vmla.f32 q10, q6, q2 @ w5 * inr14\n" - "vmla.f32 q11, q6, q3 @ w5 * inr15\n" - "vld1.32 {d4-d7}, [%[r2]]! @ load r2, q0, q1\n" - "vld1.32 {d14-d15}, [%[wc0]]! @ load w6, to q7\n" - /* mul r2 with w3-w8, get out r0, r1 */ - "vmla.f32 q12, q4, q0 @ w3 * inr20\n" - "vmla.f32 q13, q4, q1 @ w3 * inr21\n" - "vmla.f32 q14, q4, q2 @ w3 * inr22\n" - "vmla.f32 q15, q4, q3 @ w3 * inr23\n" - "vld1.32 {d8-d9}, [%[wc0]]! @ load w7, to q4\n" - "vmla.f32 q8, q7, q0 @ w6 * inr20\n" - "vmla.f32 q9, q7, q1 @ w6 * inr21\n" - "vmla.f32 q10, q7, q2 @ w6 * inr22\n" - "vmla.f32 q11, q7, q3 @ w6 * inr23\n" - /* mul r2 with w4, w7, get out r1, r0 */ - "vmla.f32 q8, q4, q1 @ w7 * inr21\n" - "vmla.f32 q12, q5, q1 @ w4 * inr21\n" - "vld1.32 {d0-d3}, [%[r2]] @ load r2, q0, q1\n" - "vmla.f32 q9, q4, q2 @ w7 * inr22\n" - "vmla.f32 q13, q5, q2 @ w4 * inr22\n" - "vmla.f32 q10, q4, q3 @ w7 * inr23\n" - "vmla.f32 q14, q5, q3 @ w4 * inr23\n" - "vmla.f32 q11, q4, q0 @ w7 * inr24\n" - "vmla.f32 q15, q5, q0 @ w4 * inr24\n" - "vld1.32 {d10-d11}, [%[wc0]]! @ load w8 to q5\n" - /* mul r1 with w5, w8, get out r1, r0 */ - "vmla.f32 q12, q6, q2 @ w5 * inr22\n" - "vmla.f32 q13, q6, q3 @ w5 * inr23\n" - "vmla.f32 q8, q5, q2 @ w8 * inr22\n" - "vmla.f32 q9, q5, q3 @ w8 * inr23\n" - "vld1.32 {d4-d7}, [%[r3]]! @ load r3, q2, q3\n" - "ldr r4, [%[outl], #32] @ load bias addr to r4\n" - "vmla.f32 q14, q6, q0 @ w5 * inr24\n" - "vmla.f32 q15, q6, q1 @ w5 * inr25\n" - "vmla.f32 q10, q5, q0 @ w8 * inr24\n" - "vmla.f32 q11, q5, q1 @ w8 * inr25\n" - "vld1.32 {d0-d3}, [%[r3]]! 
@ load r3, q0, q1\n" - "sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" - /* mul r3 with w6, w7, w8, get out r1 */ - "vmla.f32 q12, q7, q2 @ w6 * inr30\n" - "vmla.f32 q13, q7, q3 @ w6 * inr31\n" - "vmla.f32 q14, q7, q0 @ w6 * inr32\n" - "vmla.f32 q15, q7, q1 @ w6 * inr33\n" - "vmla.f32 q12, q4, q3 @ w7 * inr31\n" - "vld1.32 {d4-d7}, [%[r3]] @ load r3, q2, q3\n" - "vld1.32 {d12-d13}, [r4] @ load bias\n" - "vmla.f32 q13, q4, q0 @ w7 * inr32\n" - "vmla.f32 q14, q4, q1 @ w7 * inr33\n" - "vmla.f32 q15, q4, q2 @ w7 * inr34\n" - "ldr r0, [%[outl]] @ load outc00 to r0\n" - "vmla.f32 q12, q5, q0 @ w8 * inr32\n" - "vmla.f32 q13, q5, q1 @ w8 * inr33\n" - "ldr r5, [%[outl], #36] @ load flag_relu to r5\n" - "vmla.f32 q14, q5, q2 @ w8 * inr34\n" - "vmla.f32 q15, q5, q3 @ w8 * inr35\n" - "ldr r1, [%[outl], #4] @ load outc10 to r1\n" - "vadd.f32 q8, q8, q6 @ r00 add bias\n" - "vadd.f32 q9, q9, q6 @ r01 add bias\n" - "vadd.f32 q10, q10, q6 @ r02 add bias\n" - "vadd.f32 q11, q11, q6 @ r03 add bias\n" - "ldr r2, [%[outl], #8] @ load outc20 to r2\n" - "vadd.f32 q12, q12, q6 @ r10 add bias\n" - "vadd.f32 q13, q13, q6 @ r11 add bias\n" - "vadd.f32 q14, q14, q6 @ r12 add bias\n" - "vadd.f32 q15, q15, q6 @ r13 add bias\n" - "ldr r3, [%[outl], #12] @ load outc30 to r3\n" - "vmov.u32 q7, #0 @ mov zero to q7\n" - "cmp r5, #0 @ cmp flag relu\n" - "beq 1f @ skip relu\n" - "vmax.f32 q8, q8, q7 @ r00 relu\n" - "vmax.f32 q9, q9, q7 @ r01 relu\n" - "vmax.f32 q10, q10, q7 @ r02 relu\n" - "vmax.f32 q11, q11, q7 @ r03 relu\n" - "vmax.f32 q12, q12, q7 @ r10 relu\n" - "vmax.f32 q13, q13, q7 @ r11 relu\n" - "vmax.f32 q14, q14, q7 @ r12 relu\n" - "vmax.f32 q15, q15, q7 @ r13 relu\n" - "1:\n" - "ldr r4, [%[outl], #16] @ load outc01 to r4\n" - "vtrn.32 q8, q9 @ r0: q8 : a0a1c0c1, q9 : b0b1d0d1\n" - "vtrn.32 q10, q11 @ r0: q10: a2a3c2c3, q11: b2b3d2d3\n" - "vtrn.32 q12, q13 @ r1: q12: a0a1c0c1, q13: b0b1d0d1\n" - "vtrn.32 q14, q15 @ r1: q14: a2a3c2c3, q15: b2b3d2d3\n" - "ldr r5, [%[outl], #20] @ load outc11 to r5\n" - "vswp d17, d20 @ r0: q8 : a0a1a2a3, q10: c0c1c2c3 \n" - "vswp d19, d22 @ r0: q9 : b0b1b2b3, q11: d0d1d2d3 \n" - "vswp d25, d28 @ r1: q12: a0a1a2a3, q14: c0c1c2c3 \n" - "vswp d27, d30 @ r1: q13: b0b1b2b3, q15: d0d1d2d3 \n" - "cmp %[flag_mask], #0 @ cmp flag mask\n" - "bne 2f\n" - "vst1.32 {d16-d17}, [r0] @ save outc00\n" - "vst1.32 {d18-d19}, [r1] @ save outc10\n" - "vst1.32 {d20-d21}, [r2] @ save outc20\n" - "vst1.32 {d22-d23}, [r3] @ save outc30\n" - "vst1.32 {d24-d25}, [r4] @ save outc01\n" - "vst1.32 {d26-d27}, [r5] @ save outc11\n" - "ldr r0, [%[outl], #24] @ load outc21 to r0\n" - "ldr r1, [%[outl], #28] @ load outc31 to r1\n" - "vst1.32 {d28-d29}, [r0] @ save outc21\n" - "vst1.32 {d30-d31}, [r1] @ save outc31\n" - "b 3f @ branch end\n" - "2: \n" - "vst1.32 {d16-d17}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d18-d19}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d20-d21}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d22-d23}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d24-d25}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d26-d27}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d28-d29}, [%[out0]]! @ save remain to pre_out\n" - "vst1.32 {d30-d31}, [%[out0]]! 
@ save remain to pre_out\n" - "3: \n" - : [r0] "+r"(inr0), [r1] "+r"(inr1), - [r2] "+r"(inr2), [r3] "+r"(inr3), - [out0] "+r"(out0), [wc0] "+r"(weight_c) - : [flag_mask] "r" (flag_mask), [outl] "r" (outl_ptr) - : "cc", "memory", - "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", - "q10", "q11", "q12", "q13","q14", "q15", "r0", "r1", "r2", "r3", "r4", "r5" - ); -#endif // __arch64__ - // clang-format on - outl[0] += 4; - outl[1] += 4; - outl[2] += 4; - outl[3] += 4; - outl[4] += 4; - outl[5] += 4; - outl[6] += 4; - outl[7] += 4; - if (flag_mask) { - memcpy(outl[0] - 4, pre_out, remain * sizeof(float)); - memcpy(outl[1] - 4, pre_out + 4, remain * sizeof(float)); - memcpy(outl[2] - 4, pre_out + 8, remain * sizeof(float)); - memcpy(outl[3] - 4, pre_out + 12, remain * sizeof(float)); - memcpy(outl[4] - 4, pre_out + 16, remain * sizeof(float)); - memcpy(outl[5] - 4, pre_out + 20, remain * sizeof(float)); - memcpy(outl[6] - 4, pre_out + 24, remain * sizeof(float)); - memcpy(outl[7] - 4, pre_out + 28, remain * sizeof(float)); - } - } - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv3x3s2_depthwise_fp32.cc b/lite/backends/arm/math/conv3x3s2_depthwise_fp32.cc deleted file mode 100644 index 2d75323a96..0000000000 --- a/lite/backends/arm/math/conv3x3s2_depthwise_fp32.cc +++ /dev/null @@ -1,361 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
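The stride-1 kernel deleted above rounds the output width up to a multiple of 4, runs the final NEON block into the scratch buffer pre_out, and copies back only the `remain` valid floats; the stride-2 variant that follows does the same, differing only in its input extent (win_ext = ow * 2 + 1 versus ow + 2 for stride 1). A plain-C++ sketch of that working-set arithmetic and tail copy, with no NEON: RoundUp, InputExtent, and WriteRow are illustrative stand-ins, and the source row is assumed padded to the rounded width, as the pre-pack step guarantees in the real kernels.

#include <cassert>
#include <cstring>

// Round n up to a multiple of m, as the kernels' ROUNDUP macro does.
static int RoundUp(int n, int m) { return ((n + m - 1) / m) * m; }

// Input columns touched by a k-tap, stride-s window producing `out` pixels.
static int InputExtent(int out, int k, int s) { return s * (out - 1) + k; }

// Copy one output row in 4-wide blocks; the final partial block goes through
// a scratch buffer so the 4-wide store never writes past `ow` elements.
static void WriteRow(float* out, const float* src, int ow) {
  const int w_loop = RoundUp(ow, 4) / 4;
  const int tail = ow % 4;  // valid lanes in the last block (0 = full block)
  float pre_out[4];         // scratch block, like the kernels' pre_out
  for (int w = 0; w < w_loop; ++w) {
    const bool masked = (w == w_loop - 1) && tail > 0;
    // Stand-in for the NEON compute+store; src is padded to RoundUp(ow, 4).
    std::memcpy(masked ? pre_out : out + 4 * w, src + 4 * w,
                4 * sizeof(float));
    if (masked) std::memcpy(out + 4 * w, pre_out, tail * sizeof(float));
  }
}

int main() {
  const int ow = 8;
  assert(InputExtent(ow, 3, 1) == ow + 2);      // stride 1: win_ext = ow + 2
  assert(InputExtent(ow, 3, 2) == 2 * ow + 1);  // stride 2: win_ext = 2*ow + 1
  float src[12] = {0};
  float dst[10];
  WriteRow(dst, src, 10);  // 3 blocks; the last copies back only 2 lanes
  return 0;
}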
- -#include -#include "lite/backends/arm/math/conv_block_utils.h" -#include "lite/backends/arm/math/conv_impl.h" -#include "lite/core/context.h" -#include "lite/operators/op_params.h" -#ifdef ARM_WITH_OMP -#include -#endif - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_3x3s2_depthwise_fp32(const float* i_data, - float* o_data, - int bs, - int oc, - int oh, - int ow, - int ic, - int ih, - int win, - const float* weights, - const float* bias, - const operators::ConvParam& param, - ARMContext* ctx) { - int threads = ctx->threads(); - const int pad_h = param.paddings[0]; - const int pad_w = param.paddings[1]; - const int out_c_block = 4; - const int out_h_kernel = 1; - const int out_w_kernel = 4; - const int win_ext = ow * 2 + 1; - const int ow_round = ROUNDUP(ow, 4); - const int win_round = ROUNDUP(win_ext, 4); - const int hin_round = oh * 2 + 1; - const int prein_size = win_round * hin_round * out_c_block; - auto workspace_size = - threads * prein_size + win_round /*tmp zero*/ + ow_round /*tmp writer*/; - ctx->ExtendWorkspace(sizeof(float) * workspace_size); - - bool flag_relu = param.fuse_relu; - bool flag_bias = param.bias != nullptr; - - /// get workspace - auto ptr_zero = ctx->workspace_data(); - memset(ptr_zero, 0, sizeof(float) * win_round); - float* ptr_write = ptr_zero + win_round; - - int size_in_channel = win * ih; - int size_out_channel = ow * oh; - - int ws = -pad_w; - int we = ws + win_round; - int hs = -pad_h; - int he = hs + hin_round; - int w_loop = ow_round / 4; - auto remain = w_loop * 4 - ow; - bool flag_remain = remain > 0; - remain = 4 - remain; - remain = remain > 0 ? remain : 0; - int row_len = win_round * out_c_block; - - for (int n = 0; n < bs; ++n) { - const float* din_batch = i_data + n * ic * size_in_channel; - float* dout_batch = o_data + n * oc * size_out_channel; -#pragma omp parallel for num_threads(threads) - for (int c = 0; c < oc; c += out_c_block) { -#ifdef ARM_WITH_OMP - float* pre_din = ptr_write + ow_round + omp_get_thread_num() * prein_size; -#else - float* pre_din = ptr_write + ow_round; -#endif - /// const array size - prepack_input_nxwc4_dw( - din_batch, pre_din, c, hs, he, ws, we, ic, win, ih, ptr_zero); - const float* weight_c = weights + c * 9; // kernel_w * kernel_h - float* dout_c00 = dout_batch + c * size_out_channel; - float bias_local[4] = {0, 0, 0, 0}; - if (flag_bias) { - bias_local[0] = bias[c]; - bias_local[1] = bias[c + 1]; - bias_local[2] = bias[c + 2]; - bias_local[3] = bias[c + 3]; - } -#ifdef __aarch64__ - float32x4_t w0 = vld1q_f32(weight_c); // w0, v23 - float32x4_t w1 = vld1q_f32(weight_c + 4); // w1, v24 - float32x4_t w2 = vld1q_f32(weight_c + 8); // w2, v25 - float32x4_t w3 = vld1q_f32(weight_c + 12); // w3, v26 - float32x4_t w4 = vld1q_f32(weight_c + 16); // w4, v27 - float32x4_t w5 = vld1q_f32(weight_c + 20); // w5, v28 - float32x4_t w6 = vld1q_f32(weight_c + 24); // w6, v29 - float32x4_t w7 = vld1q_f32(weight_c + 28); // w7, v30 - float32x4_t w8 = vld1q_f32(weight_c + 32); // w8, v31 -#endif - for (int h = 0; h < oh; h += out_h_kernel) { - float* outc0 = dout_c00 + h * ow; - float* outc1 = outc0 + size_out_channel; - float* outc2 = outc1 + size_out_channel; - float* outc3 = outc2 + size_out_channel; - const float* inr0 = pre_din + h * 2 * row_len; - const float* inr1 = inr0 + row_len; - const float* inr2 = inr1 + row_len; - if (c + out_c_block > oc) { - switch (c + out_c_block - oc) { - case 3: - outc1 = ptr_write; - case 2: - outc2 = ptr_write; - case 1: - outc3 = ptr_write; - 
default: - break; - } - } - auto c0 = outc0; - auto c1 = outc1; - auto c2 = outc2; - auto c3 = outc3; - float pre_out[16]; - for (int w = 0; w < w_loop; ++w) { - bool flag_mask = (w == w_loop - 1) && flag_remain; - if (flag_mask) { - c0 = outc0; - c1 = outc1; - c2 = outc2; - c3 = outc3; - outc0 = pre_out; - outc1 = pre_out + 4; - outc2 = pre_out + 8; - outc3 = pre_out + 12; - } -// clang-format off -#ifdef __aarch64__ - asm volatile( - "ldr q8, [%[bias]]\n" /* load bias */ - "ldp q0, q1, [%[inr0]], #32\n" /* load input r0*/ - "and v19.16b, v8.16b, v8.16b\n" - "ldp q2, q3, [%[inr0]], #32\n" /* load input r0*/ - "and v20.16b, v8.16b, v8.16b\n" - "ldp q4, q5, [%[inr0]], #32\n" /* load input r0*/ - "and v21.16b, v8.16b, v8.16b\n" - "ldp q6, q7, [%[inr0]], #32\n" /* load input r0*/ - "and v22.16b, v8.16b, v8.16b\n" - "ldr q8, [%[inr0]]\n" /* load input r0*/ - /* r0 mul w0-w2, get out */ - "fmla v19.4s , %[w0].4s, v0.4s\n" /* outr0 = w0 * r0, 0*/ - "fmla v20.4s , %[w0].4s, v2.4s\n" /* outr1 = w0 * r0, 2*/ - "fmla v21.4s , %[w0].4s, v4.4s\n" /* outr2 = w0 * r0, 4*/ - "fmla v22.4s , %[w0].4s, v6.4s\n" /* outr3 = w0 * r0, 6*/ - "fmla v19.4s , %[w1].4s, v1.4s\n" /* outr0 = w1 * r0, 1*/ - "ldp q0, q1, [%[inr1]], #32\n" /* load input r1*/ - "fmla v20.4s , %[w1].4s, v3.4s\n" /* outr1 = w1 * r0, 3*/ - "fmla v21.4s , %[w1].4s, v5.4s\n" /* outr2 = w1 * r0, 5*/ - "fmla v22.4s , %[w1].4s, v7.4s\n" /* outr3 = w1 * r0, 7*/ - "fmla v19.4s , %[w2].4s, v2.4s\n" /* outr0 = w0 * r0, 2*/ - "ldp q2, q3, [%[inr1]], #32\n" /* load input r1*/ - "fmla v20.4s , %[w2].4s, v4.4s\n" /* outr1 = w0 * r0, 4*/ - "ldp q4, q5, [%[inr1]], #32\n" /* load input r1*/ - "fmla v21.4s , %[w2].4s, v6.4s\n" /* outr2 = w0 * r0, 6*/ - "ldp q6, q7, [%[inr1]], #32\n" /* load input r1*/ - "fmla v22.4s , %[w2].4s, v8.4s\n" /* outr3 = w0 * r0, 8*/ - "ldr q8, [%[inr1]]\n" /* load input r1*/ - /* r1, mul w3-w5, get out */ - "fmla v19.4s , %[w3].4s, v0.4s\n" /* outr0 = w3 * r1, 0*/ - "fmla v20.4s , %[w3].4s, v2.4s\n" /* outr1 = w3 * r1, 2*/ - "fmla v21.4s , %[w3].4s, v4.4s\n" /* outr2 = w3 * r1, 4*/ - "fmla v22.4s , %[w3].4s, v6.4s\n" /* outr3 = w3 * r1, 6*/ - "fmla v19.4s , %[w4].4s, v1.4s\n" /* outr0 = w4 * r1, 1*/ - "ldp q0, q1, [%[inr2]], #32\n" /* load input r2*/ - "fmla v20.4s , %[w4].4s, v3.4s\n" /* outr1 = w4 * r1, 3*/ - "fmla v21.4s , %[w4].4s, v5.4s\n" /* outr2 = w4 * r1, 5*/ - "fmla v22.4s , %[w4].4s, v7.4s\n" /* outr3 = w4 * r1, 7*/ - "fmla v19.4s , %[w5].4s, v2.4s\n" /* outr0 = w5 * r1, 2*/ - "ldp q2, q3, [%[inr2]], #32\n" /* load input r2*/ - "fmla v20.4s , %[w5].4s, v4.4s\n" /* outr1 = w5 * r1, 4*/ - "ldp q4, q5, [%[inr2]], #32\n" /* load input r2*/ - "fmla v21.4s , %[w5].4s, v6.4s\n" /* outr2 = w5 * r1, 6*/ - "ldp q6, q7, [%[inr2]], #32\n" /* load input r2*/ - "fmla v22.4s , %[w5].4s, v8.4s\n" /* outr3 = w5 * r1, 8*/ - "ldr q8, [%[inr2]]\n" /* load input r2*/ - /* r2, mul w6-w8, get out r0, r1 */ - "fmla v19.4s , %[w6].4s, v0.4s\n" /* outr0 = w6 * r2, 0*/ - "fmla v20.4s , %[w6].4s, v2.4s\n" /* outr1 = w6 * r2, 2*/ - "fmla v21.4s , %[w6].4s, v4.4s\n" /* outr2 = w6 * r2, 4*/ - "fmla v22.4s , %[w6].4s, v6.4s\n" /* outr3 = w6 * r2, 6*/ - "fmla v19.4s , %[w7].4s, v1.4s\n" /* outr0 = w7 * r2, 1*/ - "fmla v20.4s , %[w7].4s, v3.4s\n" /* outr1 = w7 * r2, 3*/ - "fmla v21.4s , %[w7].4s, v5.4s\n" /* outr2 = w7 * r2, 5*/ - "fmla v22.4s , %[w7].4s, v7.4s\n" /* outr3 = w7 * r2, 7*/ - "fmla v19.4s , %[w8].4s, v2.4s\n" /* outr0 = w8 * r2, 2*/ - "fmla v20.4s , %[w8].4s, v4.4s\n" /* outr1 = w8 * r2, 4*/ - "fmla v21.4s , %[w8].4s, v6.4s\n" /* outr2 = 
w8 * r2, 6*/ - "fmla v22.4s , %[w8].4s, v8.4s\n" /* outr3 = w8 * r2, 8*/ - /* transpose */ - "trn1 v0.4s, v19.4s, v20.4s\n" /* r0: a0a1c0c1*/ - "trn2 v1.4s, v19.4s, v20.4s\n" /* r0: b0b1d0d1*/ - "trn1 v2.4s, v21.4s, v22.4s\n" /* r0: a2a3c2c3*/ - "trn2 v3.4s, v21.4s, v22.4s\n" /* r0: b2b3d2d3*/ - "trn1 v19.2d, v0.2d, v2.2d\n" /* r0: a0a1a2a3*/ - "trn2 v21.2d, v0.2d, v2.2d\n" /* r0: c0c1c2c3*/ - "trn1 v20.2d, v1.2d, v3.2d\n" /* r0: b0b1b2b3*/ - "trn2 v22.2d, v1.2d, v3.2d\n" /* r0: d0d1d2d3*/ - /* relu */ - "cbz %w[flag_relu], 0f\n" /* skip relu*/ - "movi v0.4s, #0\n" /* for relu */ - "fmax v19.4s, v19.4s, v0.4s\n" - "fmax v20.4s, v20.4s, v0.4s\n" - "fmax v21.4s, v21.4s, v0.4s\n" - "fmax v22.4s, v22.4s, v0.4s\n" - /* save result */ - "0:\n" - "str q19, [%[outc0]], #16\n" - "str q20, [%[outc1]], #16\n" - "str q21, [%[outc2]], #16\n" - "str q22, [%[outc3]], #16\n" - :[inr0] "+r"(inr0), [inr1] "+r"(inr1), - [inr2] "+r"(inr2), - [outc0]"+r"(outc0), [outc1]"+r"(outc1), - [outc2]"+r"(outc2), [outc3]"+r"(outc3) - :[w0] "w"(w0), [w1] "w"(w1), [w2] "w"(w2), - [w3] "w"(w3), [w4] "w"(w4), [w5] "w"(w5), - [w6] "w"(w6), [w7] "w"(w7), [w8] "w"(w8), - [bias] "r" (bias_local), [flag_relu]"r"(flag_relu) - : "cc", "memory", - "v0","v1","v2","v3","v4","v5","v6","v7", - "v8", "v19","v20","v21","v22" - ); -#else - asm volatile( - /* fill with bias */ - "vld1.32 {d16-d17}, [%[bias]]\n" /* load bias */ - /* load weights */ - "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w0-2, to q9-11 */ - "vld1.32 {d0-d3}, [%[r0]]!\n" /* load input r0, 0,1*/ - "vand.i32 q12, q8, q8\n" - "vld1.32 {d4-d7}, [%[r0]]!\n" /* load input r0, 2,3*/ - "vand.i32 q13, q8, q8\n" - "vld1.32 {d8-d11}, [%[r0]]!\n" /* load input r0, 4,5*/ - "vand.i32 q14, q8, q8\n" - "vld1.32 {d12-d15}, [%[r0]]!\n" /* load input r0, 6,7*/ - "vand.i32 q15, q8, q8\n" - "vld1.32 {d16-d17}, [%[r0]]\n" /* load input r0, 8*/ - /* mul r0 with w0, w1, w2 */ - "vmla.f32 q12, q9, q0 @ w0 * inr0\n" - "vmla.f32 q13, q9, q2 @ w0 * inr2\n" - "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w2, to q11 */ - "vmla.f32 q14, q9, q4 @ w0 * inr4\n" - "vmla.f32 q15, q9, q6 @ w0 * inr6\n" - "vmla.f32 q12, q10, q1 @ w1 * inr1\n" - "vld1.32 {d0-d3}, [%[r1]]! @ load r1, 0, 1\n" - "vmla.f32 q13, q10, q3 @ w1 * inr3\n" - "vmla.f32 q14, q10, q5 @ w1 * inr5\n" - "vmla.f32 q15, q10, q7 @ w1 * inr7\n" - "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w3-4, to q9-10 */ - "vmla.f32 q12, q11, q2 @ w2 * inr2\n" - "vld1.32 {d4-d7}, [%[r1]]! @ load r1, 2, 3\n" - "vmla.f32 q13, q11, q4 @ w2 * inr4\n" - "vld1.32 {d8-d11}, [%[r1]]! @ load r1, 4, 5\n" - "vmla.f32 q14, q11, q6 @ w2 * inr6\n" - "vld1.32 {d12-d15}, [%[r1]]! @ load r1, 6, 7\n" - "vmla.f32 q15, q11, q8 @ w2 * inr8\n" - /* mul r1 with w3, w4, w5 */ - "vmla.f32 q12, q9, q0 @ w3 * inr0\n" - "vmla.f32 q13, q9, q2 @ w3 * inr2\n" - "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w5, to q11 */ - "vmla.f32 q14, q9, q4 @ w3 * inr4\n" - "vmla.f32 q15, q9, q6 @ w3 * inr6\n" - "vld1.32 {d16-d17}, [%[r1]]\n" /* load input r1, 8*/ - "vmla.f32 q12, q10, q1 @ w4 * inr1\n" - "vld1.32 {d0-d3}, [%[r2]]! @ load r2, 0, 1\n" - "vmla.f32 q13, q10, q3 @ w4 * inr3\n" - "vmla.f32 q14, q10, q5 @ w4 * inr5\n" - "vmla.f32 q15, q10, q7 @ w4 * inr7\n" - "vld1.32 {d18-d21}, [%[wc0]]!\n" /* load w6-7, to q9-10 */ - "vmla.f32 q12, q11, q2 @ w5 * inr2\n" - "vld1.32 {d4-d7}, [%[r2]]! @ load r2, 2, 3\n" - "vmla.f32 q13, q11, q4 @ w5 * inr4\n" - "vld1.32 {d8-d11}, [%[r2]]! @ load r2, 4, 5\n" - "vmla.f32 q14, q11, q6 @ w5 * inr6\n" - "vld1.32 {d12-d15}, [%[r2]]! 
@ load r2, 6, 7\n" - "vmla.f32 q15, q11, q8 @ w5 * inr8\n" - /* mul r2 with w6, w7, w8 */ - "vmla.f32 q12, q9, q0 @ w6 * inr0\n" - "vmla.f32 q13, q9, q2 @ w6 * inr2\n" - "vld1.32 {d22-d23}, [%[wc0]]!\n" /* load w8, to q11 */ - "vmla.f32 q14, q9, q4 @ w6 * inr4\n" - "vmla.f32 q15, q9, q6 @ w6 * inr6\n" - "vld1.32 {d16-d17}, [%[r2]]\n" /* load input r2, 8*/ - "vmla.f32 q12, q10, q1 @ w7 * inr1\n" - "vmla.f32 q13, q10, q3 @ w7 * inr3\n" - "vmla.f32 q14, q10, q5 @ w7 * inr5\n" - "vmla.f32 q15, q10, q7 @ w7 * inr7\n" - "sub %[wc0], %[wc0], #144 @ wc0 - 144 to start address\n" - "vmla.f32 q12, q11, q2 @ w8 * inr2\n" - "vmla.f32 q13, q11, q4 @ w8 * inr4\n" - "vmla.f32 q14, q11, q6 @ w8 * inr6\n" - "vmla.f32 q15, q11, q8 @ w8 * inr8\n" - /* transpose */ - "vtrn.32 q12, q13\n" /* a0a1c0c1, b0b1d0d1*/ - "vtrn.32 q14, q15\n" /* a2a3c2c3, b2b3d2d3*/ - "vswp d25, d28\n" /* a0a1a2a3, c0c1c2c3*/ - "vswp d27, d30\n" /* b0b1b2b3, d0d1d2d3*/ - "cmp %[flag_relu], #0\n" - "beq 0f\n" /* skip relu*/ - "vmov.u32 q0, #0\n" - "vmax.f32 q12, q12, q0\n" - "vmax.f32 q13, q13, q0\n" - "vmax.f32 q14, q14, q0\n" - "vmax.f32 q15, q15, q0\n" - "0:\n" - "vst1.32 {d24-d25}, [%[outc0]]!\n" /* save outc0*/ - "vst1.32 {d26-d27}, [%[outc1]]!\n" /* save outc1*/ - "vst1.32 {d28-d29}, [%[outc2]]!\n" /* save outc2*/ - "vst1.32 {d30-d31}, [%[outc3]]!\n" /* save outc3*/ - :[r0] "+r"(inr0), [r1] "+r"(inr1), - [r2] "+r"(inr2), [wc0] "+r" (weight_c), - [outc0]"+r"(outc0), [outc1]"+r"(outc1), - [outc2]"+r"(outc2), [outc3]"+r"(outc3) - :[bias] "r" (bias_local), - [flag_relu]"r"(flag_relu) - :"cc", "memory", - "q0","q1","q2","q3","q4","q5","q6","q7", - "q8", "q9","q10","q11","q12","q13","q14","q15" - ); -#endif // __arch64__ - // clang-format off - if (flag_mask) { - for (int i = 0; i < remain; ++i) { - c0[i] = pre_out[i]; - c1[i] = pre_out[i + 4]; - c2[i] = pre_out[i + 8]; - c3[i] = pre_out[i + 12]; - } - } - } - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_3x3p0.cc b/lite/backends/arm/math/conv_depthwise_3x3p0.cc deleted file mode 100644 index 0c050ffe6f..0000000000 --- a/lite/backends/arm/math/conv_depthwise_3x3p0.cc +++ /dev/null @@ -1,4178 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/arm/math/conv_depthwise.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_3x3s1p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! 
for input width <= 4 -void conv_depthwise_3x3s1p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s1p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3p0_fp32(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - if (stride == 1) { - if (flag_relu) { - if (w_in > 5) { - conv_depthwise_3x3s1p0_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p0_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 5) { - conv_depthwise_3x3s1p0_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p0_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } else { //! 
stride = 2 - if (flag_relu) { - if (w_in > 8) { - conv_depthwise_3x3s2p0_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p0_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 8) { - conv_depthwise_3x3s2p0_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p0_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width > 4 - */ -// 4line -void conv_depthwise_3x3s1p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = w_out >> 2; - int remain = w_out % 4; - - unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); - const int remian_idx[4] = {0, 1, 2, 3}; - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - // wr0 = vsetq_lane_f32(0.f, wr0, 3); - // wr1 = vsetq_lane_f32(0.f, wr1, 3); - // wr2 = vsetq_lane_f32(0.f, wr2, 3); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_out; i += 4) { - //! 
process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 >= h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - case 0: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = tile_w; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - - // mid - // "cmp %[cnt], #1 \n" - // "blt 5f \n" - "4: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - 
"fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r5 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 4b \n" - - // right - "5: \n" - "cmp %[remain], #1 \n" - "blt 0f \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, 
v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v18.4s}, [%[rmask]] \n" - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla 
v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - // end - "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_out; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - //! process bottom pad - if (i + 3 >= h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - case 0: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = tile_w; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d29}, [%[din3_ptr]]! 
@ load din r3\n" - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "cmp %[remain], #1 @ check whether has " - "mid cols\n" - "blt 0f @ jump to main loop start " - "point\n" - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - "0: \n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! 
end of processing mid rows - } -#endif - } -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2 - */ -// w_in > 7 -void conv_depthwise_3x3s2p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - - int tile_w = w_out >> 2; - int cnt_remain = w_out % 4; - - unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3)); - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - - float32x4_t wbias; - float bias_c = 0.f; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - bias_c = bias[i]; - } else { - wbias = vdupq_n_f32(0.f); - } - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_out; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - dr0 = dr4; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 >= h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - case 0: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = tile_w; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. 
- "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // 
pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v13.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_out; i++) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = tile_w; - unsigned int* mask_ptr = dmask; - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r1\n" - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - // mid - "2: \n" - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "subs %[cnt], #1 \n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! 
\n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} - -// 4line -void conv_depthwise_3x3s1p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = w_out >> 2; - int remain = w_out % 4; - - unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); - const int remian_idx[4] = {0, 1, 2, 3}; - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - // wr0 = vsetq_lane_f32(0.f, wr0, 3); - // wr1 = vsetq_lane_f32(0.f, wr1, 3); - // wr2 = vsetq_lane_f32(0.f, wr2, 3); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_out; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! 
process bottom pad - if (i + 5 >= h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - case 0: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = tile_w; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - - // mid - "4: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, 
%[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /* relu */ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v13.4s, v13.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - // r5 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "fmax v15.4s, v15.4s, %[vzero].4s \n" /* relu */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 4b \n" - - // right - "5: \n" - "cmp %[remain], #1 \n" - "blt 0f \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 
\n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - "ld1 {v18.4s}, [%[rmask]] \n" - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v13.4s, v13.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , 
v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /* relu */ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /* relu */ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - // end - "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_out; i += 2) { - //! process top pad pad_h = 1 - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - //! process bottom pad - if (i + 3 >= h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - case 0: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = tile_w; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d29}, [%[din3_ptr]]! 
@ load din r3\n" - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "cmp %[remain], #1 @ check whether has " - "mid cols\n" - "blt 0f @ jump to main loop start " - "point\n" - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - "0: \n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! 
end of processing mid rows - } -#endif - } -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, with reulu - */ -// w_in > 7 -void conv_depthwise_3x3s2p0_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - - int tile_w = w_out >> 2; - int cnt_remain = w_out % 4; - - unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3)); - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - - float32x4_t wbias; - float bias_c = 0.f; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - bias_c = bias[i]; - } else { - wbias = vdupq_n_f32(0.f); - } - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_out; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - dr0 = dr4; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 >= h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - case 0: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = tile_w; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. 
- "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext 
v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v14.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_out; i++) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = tile_w; - unsigned int* mask_ptr = dmask; - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r1\n" - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - // mid - "2: \n" - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "subs %[cnt], #1 \n" - "vmax.f32 q3, q3, q9 @ relu \n" - - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! 
\n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! for 4x6 convolution window - const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp1 = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); - uint32x4_t vmask_rp2 = - vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_out; j += 2) { - const float* dr0 = din_channel + j * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - doutr0 = dout_channel + j * w_out; - doutr1 = doutr0 + w_out; - - if (j + 3 >= h_in) { - switch (j + 3 - h_in) { - case 3: - dr1 = zero_ptr; - case 2: - dr2 = zero_ptr; - case 1: - dr3 = zero_ptr; - doutr1 = trash_buf; - case 0: - dr3 = zero_ptr; - doutr1 = trash_buf; - default: - break; - } - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s, v1.4s}, [%[din0]]\n" - "ld1 {v2.4s, v3.4s}, [%[din1]]\n" - "ld1 {v4.4s, v5.4s}, [%[din2]]\n" - "ld1 {v6.4s, v7.4s}, [%[din3]]\n" - - "bif v0.16b, %[zero].16b, %[mask1].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask2].16b\n" // d0_1234 - - "bif v2.16b, %[zero].16b, %[mask1].16b\n" // d1_1234 - "bif v3.16b, %[zero].16b, %[mask2].16b\n" // d1_1234 - - "bif v4.16b, %[zero].16b, %[mask1].16b\n" // d2_1234 - "bif v5.16b, %[zero].16b, %[mask2].16b\n" // d2_1234 - - "bif v6.16b, %[zero].16b, %[mask1].16b\n" // d3_1234 - "bif v7.16b, %[zero].16b, %[mask2].16b\n" // d3_1234 - - "ext v8.16b, v0.16b, v1.16b, #4\n" // d1_2345 - "ext v9.16b, v0.16b, v1.16b, 
#8\n" // d1_3450 - - "and v12.16b, %[vbias].16b, %[vbias].16b \n" // v12 = vbias - "and v13.16b, %[vbias].16b, %[vbias].16b \n" // v13 = vbias - - // r0 - "fmul v10.4s, v0.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmul v11.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v12.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v2.16b, v3.16b, #4\n" // d1_2345 - "ext v9.16b, v2.16b, v3.16b, #8\n" // d1_3450 - - // r1 - "fmul v14.4s, v2.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmla v10.4s, v2.4s, %[wr1].s[0]\n" // d0_1234 * w0[0] - - "fmul v15.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v4.16b, v5.16b, #4\n" // d1_2345 - "ext v9.16b, v4.16b, v5.16b, #8\n" // d1_3450 - - // r2 - "fmla v14.4s, v4.4s, %[wr1].s[0]\n" // d0_1234 * w0[0] - "fmla v10.4s, v4.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v6.16b, v7.16b, #4\n" // d1_2345 - "ext v9.16b, v6.16b, v7.16b, #8\n" // d1_3450 - - // r3 - "fmla v14.4s, v6.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fadd v12.4s, v12.4s, v10.4s\n" - - "fmla v13.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "fadd v12.4s, v12.4s, v11.4s\n" // out1 - "fadd v13.4s, v13.4s, v14.4s\n" // out2 - "fadd v13.4s, v13.4s, v15.4s\n" // out2 - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, [%[out2]]\n" - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); -#else - unsigned int* vmask_ptr = vmask; - float bias_val = flag_bias ? bias[i] : 0.f; - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - "vadd.f32 q4, q4, q10 @ q4 += q10 \n" - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - "vadd.f32 q4, q4, q11 @ q4 += q10 \n" - - "vadd.f32 q5, q5, q8 @ q4 += q10 \n" - "vadd.f32 q5, q5, q9 @ q4 += q10 \n" - - "vst1.32 {d8-d9}, [%[out1]] @ store result, add pointer\n" - "vst1.32 {d10-d11}, [%[out2]] @ store result, add pointer\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 - */ - -void conv_depthwise_3x3s2p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 
2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - float out_buf[4]; - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - for (int j = 0; j < h_out; ++j) { - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "movi v9.4s, #0 \n" - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" - - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" // v10={0,2,4,6} - // v11={1,3,5,7} - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" // v13={0,2,4,6} - // v12={1,3,5,7} - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" // v14={0,2,4,6} - // v15={1,3,5,7} - "and v4.16b, %[bias].16b, %[bias].16b \n" // v10 = vbias - - "bif v10.16b, v9.16b, v6.16b \n" - "bif v11.16b, v9.16b, v7.16b \n" - "bif v12.16b, v9.16b, v6.16b \n" - "bif v13.16b, v9.16b, v7.16b \n" - "bif v14.16b, v9.16b, v6.16b \n" - "bif v15.16b, v9.16b, v7.16b \n" - - "ext v6.16b, v10.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v7.16b, v12.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v8.16b, v14.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - - "fmla v4.4s, v10.4s, %[wr0].s[0] \n" // 0246 * w00 - "fmul v5.4s, v11.4s, %[wr0].s[1] \n" // 1357 * w01 - "fmul v16.4s, v6.4s, %[wr0].s[2] \n" // 2468 * w02 - - "fmla v4.4s, v12.4s, %[wr1].s[0] \n" // v12 * w11 - "fmla v5.4s, v13.4s, %[wr1].s[1] \n" // v13 * w12 - "fmla v16.4s, v7.4s, %[wr1].s[2] \n" // v7 * w10 - - "fmla v4.4s, v14.4s, %[wr2].s[0] \n" // v14 * w20 - "fmla v5.4s, v15.4s, %[wr2].s[1] \n" // v15 * w21 - "fmla v16.4s, v8.4s, %[wr2].s[2] \n" // v8 * w22 - - "fadd v4.4s, v4.4s, v5.4s \n" - "fadd v4.4s, v4.4s, v16.4s \n" - - // "fadd v4.4s, v4.4s, %[bias].4s \n" - "st1 {v4.4s}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); - -#else - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "vmov.u32 q9, #0 \n" - "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q3 = - // vbias - - "vld2.32 {d20-d23}, [%[din0_ptr]]! 
@ load din r0\n" // q10={0,2,4,6} q11={1,3,5,7} - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // q13={0,2,4,6} q12={1,3,5,7} - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // q14={0,2,4,6} q15={1,3,5,7} - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,0} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q7 = {2,4,6,0} - "vext.32 q8, q14, q9, #1 @ shift left 1 \n" // q8 = {2,4,6,0} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // {0,2,4,6} - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // {1,3,5,7} - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // {2,4,6,0} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w11 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q13 * w12 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q7 * w10 - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q14 * w20 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q15 * w21 - "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, " - "out0\n" // q8 * w22 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vst1.32 {d6-d7}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf), - [mask_ptr] "r"(dmask) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - } - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! 
for 4x6 convolution window - const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp1 = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); - uint32x4_t vmask_rp2 = - vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_out; j += 2) { - const float* dr0 = din_channel + j * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - doutr0 = dout_channel + j * w_out; - doutr1 = doutr0 + w_out; - - if (j + 3 >= h_in) { - switch (j + 3 - h_in) { - case 3: - dr1 = zero_ptr; - case 2: - dr2 = zero_ptr; - case 1: - dr3 = zero_ptr; - doutr1 = trash_buf; - case 0: - dr3 = zero_ptr; - doutr1 = trash_buf; - default: - break; - } - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s, v1.4s}, [%[din0]]\n" - "ld1 {v2.4s, v3.4s}, [%[din1]]\n" - "ld1 {v4.4s, v5.4s}, [%[din2]]\n" - "ld1 {v6.4s, v7.4s}, [%[din3]]\n" - - "bif v0.16b, %[zero].16b, %[mask1].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask2].16b\n" // d0_1234 - - "bif v2.16b, %[zero].16b, %[mask1].16b\n" // d1_1234 - "bif v3.16b, %[zero].16b, %[mask2].16b\n" // d1_1234 - - "bif v4.16b, %[zero].16b, %[mask1].16b\n" // d2_1234 - "bif v5.16b, %[zero].16b, %[mask2].16b\n" // d2_1234 - - "bif v6.16b, %[zero].16b, %[mask1].16b\n" // d3_1234 - "bif v7.16b, %[zero].16b, %[mask2].16b\n" // d3_1234 - - "ext v8.16b, v0.16b, v1.16b, #4\n" // d1_2345 - "ext v9.16b, v0.16b, v1.16b, #8\n" // d1_3450 - - "and v12.16b, %[vbias].16b, %[vbias].16b \n" // v12 = vbias - "and v13.16b, %[vbias].16b, %[vbias].16b \n" // v13 = vbias - - // r0 - "fmul v10.4s, v0.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmul v11.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v12.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v2.16b, v3.16b, #4\n" // d1_2345 - "ext v9.16b, v2.16b, v3.16b, #8\n" // d1_3450 - - // r1 - "fmul v14.4s, v2.4s, %[wr0].s[0]\n" // d0_1234 * w0[0] - "fmla v10.4s, v2.4s, %[wr1].s[0]\n" // d0_1234 * w0[0] - - "fmul v15.4s, v8.4s, %[wr0].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v4.16b, v5.16b, #4\n" // d1_2345 - "ext v9.16b, v4.16b, v5.16b, #8\n" // d1_3450 - - // r2 - "fmla v14.4s, v4.4s, %[wr1].s[0]\n" // 
d0_1234 * w0[0] - "fmla v10.4s, v4.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr1].s[1]\n" // d1_2345 * w0[1] - "fmla v11.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fmla v13.4s, v9.4s, %[wr1].s[2]\n" // d0_3456 * w0[2] - "fmla v12.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "ext v8.16b, v6.16b, v7.16b, #4\n" // d1_2345 - "ext v9.16b, v6.16b, v7.16b, #8\n" // d1_3450 - - // r3 - "fmla v14.4s, v6.4s, %[wr2].s[0]\n" // d0_1234 * w0[0] - - "fmla v15.4s, v8.4s, %[wr2].s[1]\n" // d1_2345 * w0[1] - - "fadd v12.4s, v12.4s, v10.4s\n" - - "fmla v13.4s, v9.4s, %[wr2].s[2]\n" // d0_3456 * w0[2] - - "fadd v12.4s, v12.4s, v11.4s\n" // out1 - "fadd v13.4s, v13.4s, v14.4s\n" // out2 - "fadd v13.4s, v13.4s, v15.4s\n" // out2 - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - "fmax v12.4s, v12.4s, %[zero].4s \n" - "fmax v13.4s, v13.4s, %[zero].4s \n" - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, [%[out2]]\n" - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); -#else - unsigned int* vmask_ptr = vmask; - float bias_val = flag_bias ? bias[i] : 0.f; - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! 
@ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - "vadd.f32 q4, q4, q10 @ q4 += q10 \n" - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - "vadd.f32 q4, q4, q11 @ q4 += q10 \n" - - "vadd.f32 q5, q5, q8 @ q4 += q10 \n" - "vadd.f32 q5, q5, q9 @ q4 += q10 \n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vst1.32 {d8-d9}, [%[out1]] @ store result, add pointer\n" - "vst1.32 {d10-d11}, [%[out2]] @ store result, add pointer\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - // doutr0 = doutr1; - // doutr1 += w_out; - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 7 - */ -void conv_depthwise_3x3s2p0_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int 
ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - float out_buf[4]; - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - for (int j = 0; j < h_out; ++j) { - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "movi v9.4s, #0 \n" - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]] \n" - - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" // v10={0,2,4,6} - // v11={1,3,5,7} - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" // v13={0,2,4,6} - // v12={1,3,5,7} - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" // v14={0,2,4,6} - // v15={1,3,5,7} - "and v4.16b, %[bias].16b, %[bias].16b \n" // v10 = vbias - - "bif v10.16b, v9.16b, v6.16b \n" - "bif v11.16b, v9.16b, v7.16b \n" - "bif v12.16b, v9.16b, v6.16b \n" - "bif v13.16b, v9.16b, v7.16b \n" - "bif v14.16b, v9.16b, v6.16b \n" - "bif v15.16b, v9.16b, v7.16b \n" - - "ext v6.16b, v10.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v7.16b, v12.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - "ext v8.16b, v14.16b, v9.16b, #4 \n" // v6 = - // {2,4,6,8} - - "fmla v4.4s, v10.4s, %[wr0].s[0] \n" // 0246 * w00 - "fmul v5.4s, v11.4s, %[wr0].s[1] \n" // 1357 * w01 - "fmul v16.4s, v6.4s, %[wr0].s[2] \n" // 2468 * w02 - - "fmla v4.4s, v12.4s, %[wr1].s[0] \n" // v12 * w11 - "fmla v5.4s, v13.4s, %[wr1].s[1] \n" // v13 * w12 - "fmla v16.4s, v7.4s, %[wr1].s[2] \n" // v7 * w10 - - "fmla v4.4s, v14.4s, %[wr2].s[0] \n" // v14 * w20 - "fmla v5.4s, v15.4s, %[wr2].s[1] \n" // v15 * w21 - "fmla v16.4s, v8.4s, %[wr2].s[2] \n" // v8 * w22 - - "fadd v4.4s, v4.4s, v5.4s \n" - "fadd v4.4s, v4.4s, v16.4s \n" - "fmax v4.4s, v4.4s, v9.4s \n" - - // "fadd v4.4s, v4.4s, %[bias].4s \n" - "st1 {v4.4s}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf), - [mask_ptr] "r"(mask_ptr) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); - -#else - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. 
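// In outline, the ARMv7 block below mirrors the AArch64 path above:
// q9 is zeroed and the right-pad mask is loaded into q6/q7 while the bias
// is broadcast into q3; vld2 de-interleaves each of the three input rows
// into even columns {0,2,4,6} and odd columns {1,3,5,7}; vbif zeroes the
// lanes beyond w_in; vext builds the shifted {2,4,6,0} column; each row is
// then multiply-accumulated against its weight row (wr0/wr1/wr2), the
// partial sums in q4/q5 are folded into q3, vmax against q9 applies the
// ReLU, and the four results are stored to out_buf.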
- "vmov.u32 q9, #0 \n" - "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q3 = - // vbias - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // q10={0,2,4,6} q11={1,3,5,7} - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // q13={0,2,4,6} q12={1,3,5,7} - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // q14={0,2,4,6} q15={1,3,5,7} - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,0} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q7 = {2,4,6,0} - "vext.32 q8, q14, q9, #1 @ shift left 1 \n" // q8 = {2,4,6,0} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // {0,2,4,6} - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // {1,3,5,7} - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // {2,4,6,0} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w11 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q13 * w12 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q7 * w10 - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q14 * w20 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q15 * w21 - "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, " - "out0\n" // q8 * w22 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vst1.32 {d6-d7}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf), - [mask_ptr] "r"(mask_ptr) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_3x3p1.cc b/lite/backends/arm/math/conv_depthwise_3x3p1.cc deleted file mode 100644 index 6f28d48d6d..0000000000 --- a/lite/backends/arm/math/conv_depthwise_3x3p1.cc +++ /dev/null @@ -1,4850 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/backends/arm/math/conv_depthwise.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_3x3s1p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s1p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s1p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -//! for input width <= 4 -void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3p1_fp32(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int stride, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - if (stride == 1) { - if (flag_relu) { - if (w_in > 4) { - conv_depthwise_3x3s1p1_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 4) { - conv_depthwise_3x3s1p1_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } else { //! 
stride = 2 - if (flag_relu) { - if (w_in > 7) { - conv_depthwise_3x3s2p1_bias_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p1_bias_s_relu(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } else { - if (w_in > 7) { - conv_depthwise_3x3s2p1_bias(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p1_bias_s(dout, - din, - weights, - bias, - flag_bias, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width > 4 - */ -// 4line -void conv_depthwise_3x3s1p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - // printf("conv3x3_dw start \n"); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 3) >> 2; - int cnt_col = tile_w - 2; - - unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in); - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_in; i += 4) { - //! 
process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - din_ptr4 = dr3; - din_ptr5 = dr4; - dr0 = dr3; - dr1 = dr4; - dr2 = dr5; - } else { - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - } - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 > h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = cnt_col; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - - // left - // r0 - "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * - w0[1]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * - w0[0]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * - w0[2]*/ - - "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * - w1[1]*/ - "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ - - "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla 
v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ - - // r5 - "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" 
/*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ - "cmp %[cnt], #1 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "blt 3f \n" - // mid - "1: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += 
din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 1b \n" - - // right - "3: \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v18.4s}, [%[rmask]] \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, 
%[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? 
bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - // unsigned int* rst_mask = rmask; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = cnt_col; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" - "vext.32 q7, q8, q9, #1 @ 1234\n" - - // left - // r0 - "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" - - "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" - "vext.32 q7, q10, q11, #1 @ 1234\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" - "vext.32 q7, q12, q13, #1 @ 1234\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" - "vext.32 q7, q14, q15, #1 @ 1234\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "cmp %[cnt], #1 @ check whether has " - "mid cols\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - "blt 3f @ jump to main loop start " - "point\n" - - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "vld1.32 {d19}, [%[vmask]]! 
@ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! @ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [din3_ptr] "+r"(din3_ptr), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - dout_channel += 2 * w_out; - } //! 
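The two scratch rows set up at the top of each of these kernels do all of the boundary work: `zero_ptr` (one zeroed row) stands in for input rows that fall outside the image, and `write_ptr` absorbs the store when the tile's second output row does not exist, so the asm body never branches on edge conditions. A minimal C++ sketch of that driver pattern, with a hypothetical `compute_two_rows` standing in for the asm block above (only the padding logic is taken from the surrounding code):

    // Hypothetical stand-in for the inline-asm tile above: consumes four
    // input rows and produces two output rows of a 3x3 pad-1 depthwise conv.
    void compute_two_rows(const float* rows[4], float* out0, float* out1);

    void dw3x3s1p1_rows_sketch(const float* din, float* dout,
                               int h_in, int w_in, int h_out, int w_out,
                               float* zero_ptr, float* write_ptr) {
      for (int r = 0; r < h_in; r += 2) {
        const float* rows[4];
        for (int j = 0; j < 4; ++j) {
          int src = r - 1 + j;  // pad_h == 1 shifts the window up one row
          rows[j] = (src < 0 || src >= h_in) ? zero_ptr : din + src * w_in;
        }
        // When only one of the two output rows exists, the second store is
        // redirected into the scratch row instead of being skipped.
        float* out1 = (r + 2 > h_out) ? write_ptr : dout + w_out;
        compute_two_rows(rows, dout, out1);
        dout += 2 * w_out;
      }
    }

Redirecting the dead store into `write_ptr` wastes one row of arithmetic on the last tile but keeps the store path branch-free, which is the trade the rotation of `dr0`..`dr3` above is making.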
end of processing mid rows - } -#endif - } -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2 - */ -// w_in > 7 -void conv_depthwise_3x3s2p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - int size_pad_bottom = h_out * 2 - h_in; - - int cnt_col = (w_out >> 2) - 2; - int size_right_remain = w_in - (7 + cnt_col * 8); - if (size_right_remain >= 9) { - cnt_col++; - size_right_remain -= 8; - } - int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4); // - - int size_right_pad = w_out * 2 - w_in; - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - - float32x4_t wbias; - float bias_c = 0.f; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - bias_c = bias[i]; - } else { - wbias = vdupq_n_f32(0.f); - } - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_in; i += 4) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - din4_ptr = dr3; - dr0 = dr3; - dr1 = dr4; - } else { - dr0 = dr4; - dr1 = dr0 + w_in; - } - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 > h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i / 2 + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = cnt_col; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. 
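The `ld2` loads that follow are the heart of the stride-2 scheme: `ld2` deinterleaves each row into an even-column vector and an odd-column vector, so the three filter taps line up with the even lanes, the odd lanes, and the even lanes shifted by one pair, and a whole row reduces to three vector multiply-accumulates. A minimal A64 intrinsics sketch of one "mid" row (assumptions: `wr` carries one filter row in lanes 0..2, and `r[8]` is readable; the left and right blocks differ only in building the shifted vector with `ext` against zero or a mask):

    #include <arm_neon.h>

    // One row's contribution to four stride-2 outputs: out[j] uses input
    // columns 2j, 2j+1, 2j+2. vld2q_f32 is the intrinsic form of ld2.
    static inline float32x4_t dw3x3s2_row_mid(const float* r, float32x4_t wr) {
      float32x4x2_t v = vld2q_f32(r);          // val[0]={0,2,4,6} val[1]={1,3,5,7}
      float32x4_t nxt = vld1q_dup_f32(r + 8);  // column 8, for the third tap
      float32x4_t sh = vextq_f32(v.val[0], nxt, 1);        // {2,4,6,8}
      float32x4_t acc = vmulq_laneq_f32(v.val[0], wr, 0);  // w0 * even cols
      acc = vfmaq_laneq_f32(acc, v.val[1], wr, 1);         // w1 * odd cols
      acc = vfmaq_laneq_f32(acc, sh, wr, 2);               // w2 * even+2 cols
      return acc;
    }

The asm accumulates three such rows (weights `wr0`..`wr2`) on top of the broadcast bias, and emits two output rows per pass by sharing the middle input rows between them.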
- "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" // v10 = {0,1,3,5} - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmul v12.4s, v1.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr0], %[inptr0], #4 \n" - "sub %[inptr1], %[inptr1], #4 \n" - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v12.4s, v3.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr2], %[inptr2], #4 \n" - "sub %[inptr3], %[inptr3], #4 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmla v11.4s, v4.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - - "fmul v14.4s, v5.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v12.4s, v5.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - - "fmla v17.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - "fmla v16.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr4], %[inptr4], #4 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v7.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" // v10 = {0,1,3,5} - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v9.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "cmp %[cnt], #1 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "blt 1f \n" - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" 
// {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "subs %[cnt], %[cnt], #1 \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, 
v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v13.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_in; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - } - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = cnt_col; - unsigned int* mask_ptr = dmask; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q10, q11 - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q12, q13 - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" // v13={0,2,4,6} v14={1,3,5,7}, q14, q15 - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vext.32 q6, q9, q11, #3 @ shift right 1 " - "data\n" // q2 = {0,1,3,5} - "vext.32 q7, q9, q13, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - "vext.32 q8, q9, q15, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "sub %[din0_ptr], #4 @ inpitr0 - 1\n" - "sub %[din1_ptr], #4 @ inpitr1 - 1\n" - "sub %[din2_ptr], #4 @ inpitr2 - 1\n" - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "vld2.32 {d24-d27}, [%[din1_ptr]]! 
@ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, " - "out1\n" // q0 * w01 - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, " - "out1\n" // q1 * w02 - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, " - "out1\n" // q2 * w00 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "cmp %[cnt], #1 \n" - "blt 1f \n" - // mid - "2: \n" - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "subs %[cnt], #1 \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! 
@ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} - -// 4line -void conv_depthwise_3x3s1p1_bias_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! 
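The stride-1 ReLU kernel that begins here uses a taller tile on aarch64: each pass keeps six input rows live in registers and emits four output rows, so every loaded row is reused by up to three different output rows without returning to memory. Schematically (a compilable sketch, not the kernel's actual register allocation; only the center-tap term is shown, and the `ext`-shifted terms for the outer taps are elided):

    #include <arm_neon.h>

    // 4x6 tiling: output row j accumulates input rows j, j+1, j+2 against
    // filter rows wr[0..2]; out[] arrives pre-seeded with the bias.
    static inline void tile_4x6_center(const float32x4_t in[6],
                                       const float32x4_t wr[3],
                                       float32x4_t out[4]) {
      for (int j = 0; j < 4; ++j)
        for (int k = 0; k < 3; ++k)
          out[j] = vfmaq_laneq_f32(out[j], in[j + k], wr[k], 1);
    }

The armv7 fallback later in this function keeps the narrower two-row tile, since 32-bit NEON has half as many quad registers.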
for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - // printf("conv3x3_dw start \n"); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 3) >> 2; - int tile_h = (h_in + 3) >> 2; - int cnt_col = tile_w - 2; - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in); - int size_pad_bottom = (unsigned int)(1 + (tile_h << 2) - h_in); - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for -#ifdef __aarch64__ - for (int c = 0; c < ch_in; c++) { - float* dout_ptr = dout_batch + c * size_out_channel; - - const float* din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float* wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - - float* doutr0 = dout_ptr; - float* doutr1 = doutr0 + w_out; - float* doutr2 = doutr1 + w_out; - float* doutr3 = doutr2 + w_out; - - const float* dr0 = din_ch_ptr; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - const float* dr5 = dr4 + w_in; - - const float* din_ptr0 = dr0; - const float* din_ptr1 = dr1; - const float* din_ptr2 = dr2; - const float* din_ptr3 = dr3; - const float* din_ptr4 = dr4; - const float* din_ptr5 = dr5; - - for (int i = 0; i < h_in; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - din_ptr4 = dr3; - din_ptr5 = dr4; - dr0 = dr3; - dr1 = dr4; - dr2 = dr5; - } else { - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - } - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 > h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! 
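The bottom-pad switch just above leans on deliberate case fall-through: no case has a `break` before `default`, so an overhang of k rows past the bottom edge blanks the last k input pointers with a single jump. Spelled out without fall-through (a sketch; `overhang` and `tail` are illustrative names):

    // Equivalent to the fall-through switch: blank the trailing rows that
    // would read past h_in, pointing them at the shared zero row instead.
    int overhang = i + 5 - h_in;  // how many of the six rows are out of range
    const float** tail[5] = {&din_ptr1, &din_ptr2, &din_ptr3, &din_ptr4,
                             &din_ptr5};
    for (int k = 0; k < overhang; ++k)
      *tail[5 - overhang + k] = zero_ptr;

The output-side overhang right below uses the same idiom in the other direction, parking the dead rows' stores on `write_ptr`.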
process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = cnt_col; - asm volatile( - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" - "movi v21.4s, #0x0\n" /* out0 = 0 */ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - - // left - // r0 - "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * - w0[1]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * - w0[0]*/ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * - w0[2]*/ - - "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * - w1[1]*/ - "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ - "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ - - "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* 
outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ - - // r4 - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w2[1]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w1[1]*/ - - "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ - "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - // r5 - "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * - w1[1]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ - - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * - w0[1]*/ - - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ - - "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ - "cmp %[cnt], #1 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "blt 3f \n" - // mid - "1: \n" - // r0 - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - 
w0[1]*/ - - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" 
/* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - "subs %[cnt], %[cnt], #1 \n" - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - - "bne 1b \n" - - // right - "3: \n" - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" - "ld1 {v22.4s}, [%[doutr0]] \n" - "ld1 {v23.4s}, [%[doutr1]] \n" - "ld1 {v24.4s}, [%[doutr2]] \n" - "ld1 {v25.4s}, [%[doutr3]] \n" - - "bif v0.16b, %[vzero].16b, v18.16b \n" - "bif v1.16b, %[vzero].16b, v19.16b \n" - "bif v2.16b, %[vzero].16b, v18.16b \n" - "bif v3.16b, %[vzero].16b, v19.16b \n" - - "bif v4.16b, %[vzero].16b, v18.16b \n" - "bif v5.16b, %[vzero].16b, v19.16b \n" - "bif v6.16b, %[vzero].16b, v18.16b \n" - "bif v7.16b, %[vzero].16b, v19.16b \n" - - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ - - // r0 - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "bif v8.16b, %[vzero].16b, v18.16b \n" - "bif v9.16b, %[vzero].16b, v19.16b \n" - "bif v10.16b, %[vzero].16b, v18.16b \n" - "bif v11.16b, %[vzero].16b, v19.16b \n" - - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "ld1 {v18.4s}, [%[rmask]] \n" - - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ - - // r1 - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ - - // r2 - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ - "ext 
v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v12.16b, v22.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ - - // r3 - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "st1 {v12.4s}, [%[doutr0]], #16 \n" - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v13.16b, v23.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ - - "st1 {v13.4s}, [%[doutr1]], #16 \n" - - // r3 - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * - w0[0]*/ - - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ - - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * - w0[1]*/ - - "bif v14.16b, v24.16b, v18.16b \n" - - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * - w0[2]*/ - - "st1 {v14.4s}, [%[doutr2]], #16 \n" - - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ - - "bif v15.16b, v25.16b, v18.16b \n" - - "st1 {v15.4s}, [%[doutr3]], #16 \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - dout_ptr = dout_ptr + 4 * w_out; - } - } -#else - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float bias_val = flag_bias ? 
bias[i] : 0.f; - - float* dout_channel = dout_batch + i * size_out_channel; - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - const float* din0_ptr = nullptr; - const float* din1_ptr = nullptr; - const float* din2_ptr = nullptr; - const float* din3_ptr = nullptr; - - float* doutr0 = nullptr; - float* doutr1 = nullptr; - - float* ptr_zero = const_cast(zero); - - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - - doutr0 = dout_channel; - doutr1 = dout_channel + w_out; - // unsigned int* rst_mask = rmask; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din1_ptr = zero_ptr; - case 2: - din2_ptr = zero_ptr; - case 1: - din3_ptr = zero_ptr; - default: - break; - } - } - //! process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = cnt_col; - unsigned int* rmask_ptr = rmask; - unsigned int* vmask_ptr = vmask; - asm volatile( - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" - "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" - "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" - - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - - "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" - "vext.32 q7, q8, q9, #1 @ 1234\n" - - // left - // r0 - "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" - "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" - - "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" - "vext.32 q7, q10, q11, #1 @ 1234\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" - "vext.32 q7, q12, q13, #1 @ 1234\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" - "vext.32 q7, q14, q15, #1 @ 1234\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "cmp %[cnt], #1 @ check whether has " - "mid cols\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add " - "pointer\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q5 - // = - // vbias - "blt 3f @ jump to main loop start " - "point\n" - - // mid - "1: @ right pad entry\n" - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - "pld [%[din3_ptr]] @ preload data\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - "vdup.32 q4, %[bias_val] @ and \n" // q4 - // = - // vbias - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add " - "pointer\n" - - "subs %[cnt], #1 @ loop count minus 1\n" - - "vdup.32 q5, %[bias_val] @ and \n" // q4 - // = - // vbias - - "bne 1b @ jump to main loop start " - "point\n" - - // right - "3: @ right pad entry\n" - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" - - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" - "vld1.32 {d31}, [%[vmask]]! @ load din r0\n" - - "vbif d16, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d17, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d18, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vbif d20, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d21, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d22, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vext.32 q6, q8, q9, #1 @ 1234\n" - "vext.32 q7, q8, q9, #2 @ 2345\n" - - // r0 - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" - - "vbif d24, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d25, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d26, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d28, %e[vzero], d19 @ bit select, deal with " - "right pad\n" - "vbif d29, %e[vzero], d23 @ bit select, deal with " - "right pad\n" - "vbif d30, %e[vzero], d27 @ bit select, deal with " - "right pad\n" - - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" - - "vext.32 q6, q10, q11, #1 @ 1234\n" - "vext.32 q7, q10, q11, #2 @ 2345\n" - - // r1 - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" - "vld1.32 {d23}, [%[rmask]]! @ load din r0\n" - - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" - - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q12, q13, #1 @ 1234\n" - "vext.32 q7, q12, q13, #2 @ 2345\n" - - // r2 - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" - - "vext.32 q6, q14, q15, #1 @ 1234\n" - "vext.32 q7, q14, q15, #2 @ 2345\n" - - // r3 - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" - - "vmax.f32 q4, q4, %q[vzero] @ relu \n" - - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" - - "vbif d8, d16, d19 @ bit select, deal with right pad\n" - "vbif d9, d17, d23 @ bit select, deal with right pad\n" - - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" - - "vmax.f32 q5, q5, %q[vzero] @ relu \n" - - "vbif d10, d20, d19 @ bit select, deal with right " - "pad\n" - "vbif d11, d21, d23 @ bit select, deal with right " - "pad\n" - - "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add "
-          "pointer\n"
-
-          : [dout_ptr1] "+r"(doutr0),
-            [dout_ptr2] "+r"(doutr1),
-            [din0_ptr] "+r"(din0_ptr),
-            [din1_ptr] "+r"(din1_ptr),
-            [din2_ptr] "+r"(din2_ptr),
-            [din3_ptr] "+r"(din3_ptr),
-            [cnt] "+r"(cnt),
-            [rmask] "+r"(rmask_ptr),
-            [vmask] "+r"(vmask_ptr)
-          : [wr0] "w"(wr0),
-            [wr1] "w"(wr1),
-            [wr2] "w"(wr2),
-            [bias_val] "r"(bias_val),
-            [vzero] "w"(vzero)
-          : "cc",
-            "memory",
-            "q4",
-            "q5",
-            "q6",
-            "q7",
-            "q8",
-            "q9",
-            "q10",
-            "q11",
-            "q12",
-            "q13",
-            "q14",
-            "q15");
-        dout_channel += 2 * w_out;
-      }  //! end of processing mid rows
-    }
-#endif
-  }
-}
-/**
- * \brief depthwise convolution kernel 3x3, stride 2, with relu
- */
-// w_in > 7
-void conv_depthwise_3x3s2p1_bias_relu(float* dout,
-                                      const float* din,
-                                      const float* weights,
-                                      const float* bias,
-                                      bool flag_bias,
-                                      const int num,
-                                      const int ch_in,
-                                      const int h_in,
-                                      const int w_in,
-                                      const int h_out,
-                                      const int w_out,
-                                      ARMContext* ctx) {
-  int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
-  int out_pad_idx[4] = {0, 1, 2, 3};
-  int size_pad_bottom = h_out * 2 - h_in;
-
-  int cnt_col = (w_out >> 2) - 2;
-  int size_right_remain = w_in - (7 + cnt_col * 8);
-  if (size_right_remain >= 9) {
-    cnt_col++;
-    size_right_remain -= 8;
-  }
-  int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4);  //
-
-  int size_right_pad = w_out * 2 - w_in;
-
-  uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain),
-                                   vld1q_s32(right_pad_idx));  // 0 2 4 6
-  uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain),
-                                   vld1q_s32(right_pad_idx + 4));  // 1 3 5 7
-  uint32x4_t wmask =
-      vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx));  // 0 1 2 3
-  int size_in_channel = w_in * h_in;
-  int size_out_channel = w_out * h_out;
-
-  float* zero_ptr = ctx->workspace_data<float>();
-  memset(zero_ptr, 0, w_in * sizeof(float));
-  float* write_ptr = zero_ptr + w_in;
-
-  unsigned int dmask[12];
-
-  vst1q_u32(dmask, vmask_rp1);
-  vst1q_u32(dmask + 4, vmask_rp2);
-  vst1q_u32(dmask + 8, wmask);
-
-  for (int n = 0; n < num; ++n) {
-    const float* din_batch = din + n * ch_in * size_in_channel;
-    float* dout_batch = dout + n * ch_in * size_out_channel;
-#pragma omp parallel for
-    for (int i = 0; i < ch_in; ++i) {
-      const float* din_channel = din_batch + i * size_in_channel;
-      float* dout_channel = dout_batch + i * size_out_channel;
-
-      const float* weight_ptr = weights + i * 9;
-      float32x4_t wr0 = vld1q_f32(weight_ptr);
-      float32x4_t wr1 = vld1q_f32(weight_ptr + 3);
-      float32x4_t wr2 = vld1q_f32(weight_ptr + 6);
-
-      float32x4_t vzero = vdupq_n_f32(0.f);
-
-      float32x4_t wbias;
-      float bias_c = 0.f;
-      if (flag_bias) {
-        wbias = vdupq_n_f32(bias[i]);
-        bias_c = bias[i];
-      } else {
-        wbias = vdupq_n_f32(0.f);
-      }
-
-      const float* dr0 = din_channel;
-      const float* dr1 = dr0 + w_in;
-      const float* dr2 = dr1 + w_in;
-      const float* dr3 = dr2 + w_in;
-      const float* dr4 = dr3 + w_in;
-
-      const float* din0_ptr = dr0;
-      const float* din1_ptr = dr1;
-      const float* din2_ptr = dr2;
-      const float* din3_ptr = dr3;
-      const float* din4_ptr = dr4;
-
-      float* doutr0 = dout_channel;
-      float* doutr0_ptr = nullptr;
-      float* doutr1_ptr = nullptr;
-
-#ifdef __aarch64__
-      for (int i = 0; i < h_in; i += 4) {
-        din0_ptr = dr0;
-        din1_ptr = dr1;
-        din2_ptr = dr2;
-        din3_ptr = dr3;
-        din4_ptr = dr4;
-
-        doutr0_ptr = doutr0;
-        doutr1_ptr = doutr0 + w_out;
-
-        if (i == 0) {
-          din0_ptr = zero_ptr;
-          din1_ptr = dr0;
-          din2_ptr = dr1;
-          din3_ptr = dr2;
-          din4_ptr = dr3;
-          dr0 = dr3;
-          dr1 = dr4;
-        } else {
-          dr0 = dr4;
-          dr1 = dr0 + w_in;
-        }
-        dr2 = dr1 +
w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 > h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i / 2 + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = cnt_col; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "prfm pldl1keep, [%[inptr0]] \n" - "prfm pldl1keep, [%[inptr1]] \n" - "prfm pldl1keep, [%[inptr2]] \n" - "prfm pldl1keep, [%[inptr3]] \n" - "prfm pldl1keep, [%[inptr4]] \n" - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" // v10 = {0,1,3,5} - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmul v12.4s, v1.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr0], %[inptr0], #4 \n" - "sub %[inptr1], %[inptr1], #4 \n" - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v12.4s, v3.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v16.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr2], %[inptr2], #4 \n" - "sub %[inptr3], %[inptr3], #4 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[1] \n" // {0,2,4,6} * w01 - "fmla v11.4s, v4.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - - "fmul v14.4s, v5.4s, %[w0].s[2] \n" // {1,3,5,7} * w02 - "fmla v12.4s, v5.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - - "fmla v17.4s, v10.4s, %[w0].s[0] \n" // {0,1,3,5} * w00 - "fmla v16.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" // v10 = {0,1,3,5} - - "sub %[inptr4], %[inptr4], #4 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v7.4s, %[w1].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w1].s[0] \n" // {0,1,3,5} * w00 - - "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" // v10 = {0,1,3,5} - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[1] \n" // {0,2,4,6} * w01 - "fmla v14.4s, v9.4s, %[w2].s[2] \n" // {1,3,5,7} * w02 - "fmla v17.4s, v10.4s, %[w2].s[0] \n" // {0,1,3,5} * w00 - - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - "fadd v17.4s, v17.4s, v13.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "cmp %[cnt], #1 \n" - - 
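The epilogue pattern of this relu variant is the same at every store: the accumulator is seeded from the broadcast bias (`and v16.16b, %[vbias].16b, ...` is just a register copy), the two partial sums are folded in with `fadd`, and the activation is a single `fmax` against zero immediately before `st1`, so no separate ReLU pass ever touches the output buffer. The same sequence as intrinsics (a sketch; the function name is illustrative):

    #include <arm_neon.h>

    // Fused bias + ReLU store: valid on both armv7 and aarch64 NEON.
    static inline void store_bias_relu(float* out, float32x4_t part0,
                                       float32x4_t part1, float32x4_t vbias,
                                       float32x4_t vzero) {
      float32x4_t sum = vaddq_f32(vaddq_f32(vbias, part0), part1);
      sum = vmaxq_f32(sum, vzero);  // the "fmax ... %[vzero].4s" step
      vst1q_f32(out, sum);
    }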
"and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "blt 1f \n" - // mid - "2: \n" - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, v18.16b, #4 \n" // v10 = {2,4,6,8} - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" // v0={0,2,4,6} - // v1={1,3,5,7} - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v4.16b, v19.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, v20.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, v21.16b, #4 \n" // v10 = {2,4,6,8} - - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fadd v17.4s, v17.4s, v14.4s \n" - - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - "and v16.16b, %[vbias].16b, %[vbias].16b \n" // v10 = vbias - "subs %[cnt], %[cnt], #1 \n" - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - - "and v17.16b, %[vbias].16b, %[vbias].16b \n" // v16 = vbias - - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 4f \n" - "3: \n" - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v1.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r0 - "fmul v11.4s, v0.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmul v12.4s, v1.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" // pipei - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" // pipei - - // r1 - "fmla v11.4s, v2.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v12.4s, v3.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v16.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext 
v10.16b, v4.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r2 - "fmul v13.4s, v4.4s, %[w0].s[0] \n" // {0,2,4,6} * w00 - "fmla v11.4s, v4.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - - "fmul v14.4s, v5.4s, %[w0].s[1] \n" // {1,3,5,7} * w01 - "fmla v12.4s, v5.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - - "fmla v17.4s, v10.4s, %[w0].s[2] \n" // {2,4,6,8} * w02 - "fmla v16.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - - // r3 - "fmla v13.4s, v6.4s, %[w1].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v7.4s, %[w1].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w1].s[2] \n" // {2,4,6,8} * w02 - - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" // v10 = {2,4,6,8} - "ld1 {v0.4s}, [%[outptr0]] \n" - - "fadd v16.4s, v16.4s, v11.4s \n" - "fadd v16.4s, v16.4s, v12.4s \n" - "ld1 {v1.4s}, [%[outptr1]] \n" - - // r4 - "fmla v13.4s, v8.4s, %[w2].s[0] \n" // {0,2,4,6} * w00 - "fmla v14.4s, v9.4s, %[w2].s[1] \n" // {1,3,5,7} * w01 - "fmla v17.4s, v10.4s, %[w2].s[2] \n" // {2,4,6,8} * w02 - - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ - - "fadd v17.4s, v17.4s, v13.4s \n" - - "bif v16.16b, v0.16b, %[wmask].16b \n" // pipei - - "fadd v17.4s, v17.4s, v14.4s \n" - - "st1 {v16.4s}, [%[outptr0]], #16 \n" - - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ - - "bif v17.16b, v1.16b, %[wmask].16b \n" // pipei - - "st1 {v17.4s}, [%[outptr1]], #16 \n" - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - doutr0 = doutr0 + 2 * w_out; - } -#else - - for (int i = 0; i < h_in; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - } - - //! process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = cnt_col; - - unsigned int* mask_ptr = dmask; - asm volatile( - // top - // Load up 12 elements (3 vectors) from each of 8 sources. - "0: \n" - "vmov.u32 q9, #0 \n" - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q10, q11 - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v11={0,2,4,6} v12={1,3,5,7}, q12, q13 - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r1\n" // v13={0,2,4,6} v14={1,3,5,7}, q14, q15 - "pld [%[din0_ptr]] @ preload data\n" - "pld [%[din1_ptr]] @ preload data\n" - "pld [%[din2_ptr]] @ preload data\n" - - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vext.32 q6, q9, q11, #3 @ shift right 1 " - "data\n" // q2 = {0,1,3,5} - "vext.32 q7, q9, q13, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - "vext.32 q8, q9, q15, #3 @ shift right 1 " - "data\n" // q6 = {0,1,3,5} - - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "sub %[din0_ptr], #4 @ inpitr0 - 1\n" - "sub %[din1_ptr], #4 @ inpitr1 - 1\n" - "sub %[din2_ptr], #4 @ inpitr2 - 1\n" - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, " - "out0\n" // q11 * w01 - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, " - "out0\n" // q12 * w02 - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w00 - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, " - "out1\n" // q0 * w01 - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, " - "out1\n" // q1 * w02 - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, " - "out1\n" // q2 * w00 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "cmp %[cnt], #1 \n" - "blt 1f \n" - // mid - "2: \n" - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" // q2={8,10,12,14} - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" // q2={8,10,12,14} - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // v0={0,2,4,6} v1={1,3,5,7} - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" // v4={0,2,4,6} v5={1,3,5,7} - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "subs %[cnt], #1 \n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "bne 2b \n" - - // right - "1: \n" - "cmp %[remain], #1 \n" - "blt 3f \n" - - "vld1.f32 {d12-d15}, [%[mask_ptr]]! 
@ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q10 = - // vbias - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, " - "out0\n" // q0 * w00 - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w02 - - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" // q6 = {2,4,6,8} - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" - - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, " - "out0\n" // q6 * w02 - - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" - - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, " - "out0\n" // q0 * w00 - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, " - "out0\n" // q1 * w01 - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, " - "out0\n" // q6 * w02 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu \n" - - "vbif.f32 q3, q10, q11 @ write mask\n" - - "vst1.32 {d6-d7}, [%[outptr]]! \n" - "3: \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - - doutr0 = doutr0 + w_out; - } -#endif - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! 
for 4x6 convolution window - const int right_pad_idx[4] = {3, 2, 1, 0}; - const float zero[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - int hs = -1; - int he = 3; - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - int h_cnt = (h_out + 1) >> 1; - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_cnt; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - if (hs == -1) { - dr0 = zero; - } - - switch (he - h_in) { - case 2: - dr2 = zero; - doutr1 = trash_buf; - case 1: - dr3 = zero; - default: - break; - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s}, [%[din0]], #16\n" - "ld1 {v1.4s}, [%[din1]], #16\n" - "ld1 {v2.4s}, [%[din2]], #16\n" - "ld1 {v3.4s}, [%[din3]], #16\n" - - "bif v0.16b, %[zero].16b, %[mask].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask].16b\n" // d1_1234 - "bif v2.16b, %[zero].16b, %[mask].16b\n" // d2_1234 - "bif v3.16b, %[zero].16b, %[mask].16b\n" // d3_1234 - - "ext v4.16b, %[zero].16b, v0.16b, #12\n" // d0_0123 - "ext v5.16b, %[zero].16b, v1.16b, #12\n" // d1_0123 - "ext v6.16b, %[zero].16b, v2.16b, #12\n" // d2_0123 - "ext v7.16b, %[zero].16b, v3.16b, #12\n" // d3_0123 - - "ext v8.16b, v0.16b, %[zero].16b, #4\n" // d0_2340 - "ext v9.16b, v1.16b, %[zero].16b, #4\n" // d1_2340 - "ext v10.16b, v2.16b, %[zero].16b, #4\n" // d2_2340 - "ext v11.16b, v3.16b, %[zero].16b, #4\n" // d3_2340 - - "fmul v12.4s, v0.4s, %[wr0].s[1]\n" - "fmul v13.4s, v1.4s, %[wr0].s[1]\n" - - "fmul v14.4s, v1.4s, %[wr1].s[1]\n" - "fmul v15.4s, v2.4s, %[wr1].s[1]\n" - - "fmul v16.4s, v2.4s, %[wr2].s[1]\n" - "fmul v17.4s, v3.4s, %[wr2].s[1]\n" - - "fmla v12.4s, v4.4s, %[wr0].s[0]\n" - "fmla v13.4s, v5.4s, %[wr0].s[0]\n" - - "fmla v14.4s, v5.4s, %[wr1].s[0]\n" - "fmla v15.4s, v6.4s, %[wr1].s[0]\n" - - "fmla v16.4s, v6.4s, %[wr2].s[0]\n" - "fmla v17.4s, v7.4s, %[wr2].s[0]\n" - - "fmla v12.4s, v8.4s, %[wr0].s[2]\n" - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" - - "fmla v14.4s, v9.4s, %[wr1].s[2]\n" - "fmla v15.4s, v10.4s, %[wr1].s[2]\n" - - "fmla v16.4s, v10.4s, %[wr2].s[2]\n" - "fmla v17.4s, v11.4s, %[wr2].s[2]\n" - - "fadd v12.4s, v12.4s, v14.4s\n" - "fadd v12.4s, v12.4s, v16.4s\n" - - "fadd v13.4s, v13.4s, v15.4s\n" // out1 - "fadd v13.4s, v13.4s, v17.4s\n" // out2 - - "fadd v12.4s, v12.4s, %[bias].4s\n" // out1 add bias - "fadd v13.4s, v13.4s, %[bias].4s\n" // out2 add bias - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, 
[%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); -#else - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d12-d13}, [%[din0]]!\n" - "vld1.32 {d14-d15}, [%[din1]]!\n" - "vld1.32 {d16-d17}, [%[din2]]!\n" - "vld1.32 {d18-d19}, [%[din3]]!\n" - - "vbif q6, %q[zero], %q[mask]\n" // d0_1234 - "vbif q7, %q[zero], %q[mask]\n" // d1_1234 - "vbif q8, %q[zero], %q[mask]\n" // d2_1234 - "vbif q9, %q[zero], %q[mask]\n" // d3_1234 - - "vmul.f32 q14, q6, %e[wr0][1]\n" - "vmul.f32 q15, q7, %e[wr0][1]\n" - - "vmla.f32 q14, q7, %e[wr1][1]\n" - "vmla.f32 q15, q8, %e[wr1][1]\n" - - "vmla.f32 q14, q8, %e[wr2][1]\n" - "vmla.f32 q15, q9, %e[wr2][1]\n" - - "vext.32 q10, %q[zero], q6, #3\n" // d0_0123 - "vext.32 q11, %q[zero], q7, #3\n" // d1_0123 - "vext.32 q12, %q[zero], q8, #3\n" // d2_0123 - "vext.32 q13, %q[zero], q9, #3\n" // d3_0123 - - "vmla.f32 q14, q10, %e[wr0][0]\n" - "vmla.f32 q15, q11, %e[wr0][0]\n" - - "vmla.f32 q14, q11, %e[wr1][0]\n" - "vmla.f32 q15, q12, %e[wr1][0]\n" - - "vmla.f32 q14, q12, %e[wr2][0]\n" - "vmla.f32 q15, q13, %e[wr2][0]\n" - - "vext.32 q10, q6, %q[zero], #1\n" // d0_2340 - "vext.32 q11, q7, %q[zero], #1\n" // d1_2340 - "vext.32 q12, q8, %q[zero], #1\n" // d2_2340 - "vext.32 q13, q9, %q[zero], #1\n" // d3_2340 - - "vmla.f32 q14, q10, %f[wr0][0]\n" - "vmla.f32 q15, q11, %f[wr0][0]\n" - - "vmla.f32 q14, q11, %f[wr1][0]\n" - "vmla.f32 q15, q12, %f[wr1][0]\n" - - "vmla.f32 q14, q12, %f[wr2][0]\n" // out1 - "vmla.f32 q15, q13, %f[wr2][0]\n" // out2 - - "vadd.f32 q14, q14, %q[bias]\n" // out1 add bias - "vadd.f32 q15, q15, %q[bias]\n" // out2 add bias - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vst1.32 {d28-d29}, [%[out1]]\n" - "vst1.32 {d30-d31}, [%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - doutr0 = doutr1; - doutr1 += w_out; - hs += 2; - he += 2; - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 - */ - -void conv_depthwise_3x3s2p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - 
vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - int hs = -1; - int he = 2; - float out_buf[4]; - for (int j = 0; j < h_out; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - if (hs == -1) { - dr0 = zeros; - } - if (he > h_in) { - dr2 = zeros; - } - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "movi v9.4s, #0 \n" - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" - - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" // v10={0,2,4,6} - // v11={1,3,5,7} - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" // v12={0,2,4,6} - // v13={1,3,5,7} - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" // v14={0,2,4,6} - // v15={1,3,5,7} - - "bif v10.16b, v9.16b, v6.16b \n" - "bif v11.16b, v9.16b, v7.16b \n" - "bif v12.16b, v9.16b, v6.16b \n" - "bif v13.16b, v9.16b, v7.16b \n" - "bif v14.16b, v9.16b, v6.16b \n" - "bif v15.16b, v9.16b, v7.16b \n" - - "ext v6.16b, v9.16b, v11.16b, #12 \n" // v6 = - // {0,1,3,5} - "ext v7.16b, v9.16b, v13.16b, #12 \n" // v7 = - // {0,1,3,5} - "ext v8.16b, v9.16b, v15.16b, #12 \n" // v8 = - // {0,1,3,5} - - "fmul v4.4s, v10.4s, %[wr0].s[1] \n" // v10 * w01 - "fmul v5.4s, v11.4s, %[wr0].s[2] \n" // v11 * w02 - "fmul v6.4s, v6.4s, %[wr0].s[0] \n" // v6 * w00 - - "fmla v4.4s, v12.4s, %[wr1].s[1] \n" // v12 * w11 - "fmla v5.4s, v13.4s, %[wr1].s[2] \n" // v13 * w12 - "fmla v6.4s, v7.4s, %[wr1].s[0] \n" // v7 * w10 - - "fmla v4.4s, v14.4s, %[wr2].s[1] \n" // v14 * w21 - "fmla v5.4s, v15.4s, %[wr2].s[2] \n" // v15 * w22 - "fmla v6.4s, v8.4s, %[wr2].s[0] \n" // v8 * w20 - - "fadd v4.4s, v4.4s, v5.4s \n" - "fadd v4.4s, v4.4s, v6.4s \n" - - "fadd v4.4s, v4.4s, %[bias].4s \n" // out add bias - - "st1 {v4.4s}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - -#else - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "vmov.u32 q9, #0 \n" - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q3 = - // vbias - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // q10={0,2,4,6} q11={1,3,5,7} - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // q12={0,2,4,6} q13={1,3,5,7} - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r2\n" // q14={0,2,4,6} q15={1,3,5,7} - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q9, q11, #3 @ shift left 1 \n" // q6 = {0,1,3,5} - "vext.32 q7, q9, q13, #3 @ shift left 1 \n" // q7 = {0,1,3,5} - "vext.32 q8, q9, q15, #3 @ shift left 1 \n" // q8 = {0,1,3,5} - - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 0, " - "out0\n" // q10 * w01 - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 0, " - "out0\n" // q11 * w02 - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w00 - - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, " - "out0\n" // q12 * w11 - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, " - "out0\n" // q13 * w12 - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, " - "out0\n" // q7 * w10 - - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 2, " - "out0\n" // q14 * w20 - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 2, " - "out0\n" // q15 * w21 - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 2, " - "out0\n" // q8 * w22 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vst1.32 {d6-d7}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - hs += 2; - he += 2; - } - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! 
for 4x6 convolution window - const int right_pad_idx[4] = {3, 2, 1, 0}; - const float zero[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float* dout_channel = dout_batch + i * size_out_channel; - const float* din_channel = din_batch + i * size_in_channel; - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - int hs = -1; - int he = 3; - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - int h_cnt = (h_out + 1) >> 1; - float* doutr0 = dout_channel; - float* doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_cnt; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - - if (hs == -1) { - dr0 = zero; - } - - switch (he - h_in) { - case 2: - dr2 = zero; - doutr1 = trash_buf; - case 1: - dr3 = zero; - default: - break; - } -#ifdef __aarch64__ - asm volatile( - "prfm pldl1keep, [%[din0]]\n" - "prfm pldl1keep, [%[din1]]\n" - "prfm pldl1keep, [%[din2]]\n" - "prfm pldl1keep, [%[din3]]\n" - - "ld1 {v0.4s}, [%[din0]], #16\n" - "ld1 {v1.4s}, [%[din1]], #16\n" - "ld1 {v2.4s}, [%[din2]], #16\n" - "ld1 {v3.4s}, [%[din3]], #16\n" - - "bif v0.16b, %[zero].16b, %[mask].16b\n" // d0_1234 - "bif v1.16b, %[zero].16b, %[mask].16b\n" // d1_1234 - "bif v2.16b, %[zero].16b, %[mask].16b\n" // d2_1234 - "bif v3.16b, %[zero].16b, %[mask].16b\n" // d3_1234 - - "ext v4.16b, %[zero].16b, v0.16b, #12\n" // d0_0123 - "ext v5.16b, %[zero].16b, v1.16b, #12\n" // d1_0123 - "ext v6.16b, %[zero].16b, v2.16b, #12\n" // d2_0123 - "ext v7.16b, %[zero].16b, v3.16b, #12\n" // d3_0123 - - "ext v8.16b, v0.16b, %[zero].16b, #4\n" // d0_2340 - "ext v9.16b, v1.16b, %[zero].16b, #4\n" // d1_2340 - "ext v10.16b, v2.16b, %[zero].16b, #4\n" // d2_2340 - "ext v11.16b, v3.16b, %[zero].16b, #4\n" // d3_2340 - - "fmul v12.4s, v0.4s, %[wr0].s[1]\n" - "fmul v13.4s, v1.4s, %[wr0].s[1]\n" - - "fmul v14.4s, v1.4s, %[wr1].s[1]\n" - "fmul v15.4s, v2.4s, %[wr1].s[1]\n" - - "fmul v16.4s, v2.4s, %[wr2].s[1]\n" - "fmul v17.4s, v3.4s, %[wr2].s[1]\n" - - "fmla v12.4s, v4.4s, %[wr0].s[0]\n" - "fmla v13.4s, v5.4s, %[wr0].s[0]\n" - - "fmla v14.4s, v5.4s, %[wr1].s[0]\n" - "fmla v15.4s, v6.4s, %[wr1].s[0]\n" - - "fmla v16.4s, v6.4s, %[wr2].s[0]\n" - "fmla v17.4s, v7.4s, %[wr2].s[0]\n" - - "fmla v12.4s, v8.4s, %[wr0].s[2]\n" - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" - - "fmla v14.4s, v9.4s, %[wr1].s[2]\n" - "fmla v15.4s, v10.4s, %[wr1].s[2]\n" - - "fmla v16.4s, v10.4s, %[wr2].s[2]\n" - "fmla v17.4s, v11.4s, %[wr2].s[2]\n" - - "fadd v12.4s, v12.4s, v14.4s\n" - "fadd v12.4s, v12.4s, v16.4s\n" - - "fadd v13.4s, v13.4s, v15.4s\n" // out1 - "fadd v13.4s, v13.4s, v17.4s\n" // out2 - - "fadd v12.4s, v12.4s, %[bias].4s\n" // out1 add bias - "fadd v13.4s, v13.4s, %[bias].4s\n" // out2 add bias - - "prfm pldl1keep, [%[out1]]\n" - "prfm pldl1keep, [%[out2]]\n" - - "fmax v12.4s, v12.4s, %[zero].4s\n" // out1 -> relu - 
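- // relu is fused in-register: both output rows are clamped at zero here before being staged to out_buf1/out_buf2.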
"fmax v13.4s, v13.4s, %[zero].4s\n" // out2 -> relu - - "st1 {v12.4s}, [%[out1]]\n" - "st1 {v13.4s}, [%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); -#else - asm volatile( - "pld [%[din0]]\n" - "pld [%[din1]]\n" - "pld [%[din2]]\n" - "pld [%[din3]]\n" - - "vld1.32 {d12-d13}, [%[din0]]!\n" - "vld1.32 {d14-d15}, [%[din1]]!\n" - "vld1.32 {d16-d17}, [%[din2]]!\n" - "vld1.32 {d18-d19}, [%[din3]]!\n" - - "vbif q6, %q[zero], %q[mask]\n" // d0_1234 - "vbif q7, %q[zero], %q[mask]\n" // d1_1234 - "vbif q8, %q[zero], %q[mask]\n" // d2_1234 - "vbif q9, %q[zero], %q[mask]\n" // d3_1234 - - "vmul.f32 q14, q6, %e[wr0][1]\n" - "vmul.f32 q15, q7, %e[wr0][1]\n" - - "vmla.f32 q14, q7, %e[wr1][1]\n" - "vmla.f32 q15, q8, %e[wr1][1]\n" - - "vmla.f32 q14, q8, %e[wr2][1]\n" - "vmla.f32 q15, q9, %e[wr2][1]\n" - - "vext.32 q10, %q[zero], q6, #3\n" // d0_0123 - "vext.32 q11, %q[zero], q7, #3\n" // d1_0123 - "vext.32 q12, %q[zero], q8, #3\n" // d2_0123 - "vext.32 q13, %q[zero], q9, #3\n" // d3_0123 - - "vmla.f32 q14, q10, %e[wr0][0]\n" - "vmla.f32 q15, q11, %e[wr0][0]\n" - - "vmla.f32 q14, q11, %e[wr1][0]\n" - "vmla.f32 q15, q12, %e[wr1][0]\n" - - "vmla.f32 q14, q12, %e[wr2][0]\n" - "vmla.f32 q15, q13, %e[wr2][0]\n" - - "vext.32 q10, q6, %q[zero], #1\n" // d0_2340 - "vext.32 q11, q7, %q[zero], #1\n" // d1_2340 - "vext.32 q12, q8, %q[zero], #1\n" // d2_2340 - "vext.32 q13, q9, %q[zero], #1\n" // d3_2340 - - "vmla.f32 q14, q10, %f[wr0][0]\n" - "vmla.f32 q15, q11, %f[wr0][0]\n" - - "vmla.f32 q14, q11, %f[wr1][0]\n" - "vmla.f32 q15, q12, %f[wr1][0]\n" - - "vmla.f32 q14, q12, %f[wr2][0]\n" // out1 - "vmla.f32 q15, q13, %f[wr2][0]\n" // out2 - - "vadd.f32 q14, q14, %q[bias]\n" // out1 add bias - "vadd.f32 q15, q15, %q[bias]\n" // out2 add bias - - "pld [%[out1]]\n" - "pld [%[out2]]\n" - - "vmax.f32 q14, q14, %q[zero]\n" // out1 -> relu - "vmax.f32 q15, q15, %q[zero]\n" // out2 -> relu - - "vst1.32 {d28-d29}, [%[out1]]\n" - "vst1.32 {d30-d31}, [%[out2]]\n" - - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - doutr0 = doutr1; - doutr1 += w_out; - hs += 2; - he += 2; - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 7 - */ -void conv_depthwise_3x3s2p1_bias_s_relu(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - 
uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - int hs = -1; - int he = 2; - float out_buf[4]; - for (int j = 0; j < h_out; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - if (hs == -1) { - dr0 = zeros; - } - if (he > h_in) { - dr2 = zeros; - } - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "movi v9.4s, #0 \n" - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" - - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" // v10={0,2,4,6} - // v11={1,3,5,7} - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" // v12={0,2,4,6} - // v13={1,3,5,7} - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" // v14={0,2,4,6} - // v15={1,3,5,7} - - "bif v10.16b, v9.16b, v6.16b \n" - "bif v11.16b, v9.16b, v7.16b \n" - "bif v12.16b, v9.16b, v6.16b \n" - "bif v13.16b, v9.16b, v7.16b \n" - "bif v14.16b, v9.16b, v6.16b \n" - "bif v15.16b, v9.16b, v7.16b \n" - - "ext v6.16b, v9.16b, v11.16b, #12 \n" // v6 = - // {0,1,3,5} - "ext v7.16b, v9.16b, v13.16b, #12 \n" // v7 = - // {0,1,3,5} - "ext v8.16b, v9.16b, v15.16b, #12 \n" // v8 = - // {0,1,3,5} - - "fmul v4.4s, v10.4s, %[wr0].s[1] \n" // v10 * w01 - "fmul v5.4s, v11.4s, %[wr0].s[2] \n" // v11 * w02 - "fmul v6.4s, v6.4s, %[wr0].s[0] \n" // v6 * w00 - - "fmla v4.4s, v12.4s, %[wr1].s[1] \n" // v12 * w11 - "fmla v5.4s, v13.4s, %[wr1].s[2] \n" // v13 * w12 - "fmla v6.4s, v7.4s, %[wr1].s[0] \n" // v7 * w10 - - "fmla v4.4s, v14.4s, %[wr2].s[1] \n" // v14 * w21 - "fmla v5.4s, v15.4s, %[wr2].s[2] \n" // v15 * w22 - "fmla v6.4s, v8.4s, %[wr2].s[0] \n" // v8 * w20 - - "fadd v4.4s, v4.4s, v5.4s \n" - "fadd v4.4s, v4.4s, v6.4s \n" - - "fadd v4.4s, v4.4s, %[bias].4s \n" // out add bias - "fmax v4.4s, v4.4s, v9.4s \n" // relu - - "st1 {v4.4s}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - -#else - asm volatile( - // Load up 12 elements (3 vectors) from each of 8 sources. - "vmov.u32 q9, #0 \n" - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" - "vdup.32 q3, %[bias] @ and \n" // q3 = - // vbias - - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" // q10={0,2,4,6} q11={1,3,5,7} - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" // q12={0,2,4,6} q13={1,3,5,7} - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r2\n" // q14={0,2,4,6} q15={1,3,5,7} - - "vbif q10, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q11, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q12, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q13, q9, q7 @ bit select, deal " - "with right pad\n" - "vbif q14, q9, q6 @ bit select, deal " - "with right pad\n" - "vbif q15, q9, q7 @ bit select, deal " - "with right pad\n" - - "vext.32 q6, q9, q11, #3 @ shift left 1 \n" // q6 = {0,1,3,5} - "vext.32 q7, q9, q13, #3 @ shift left 1 \n" // q7 = {0,1,3,5} - "vext.32 q8, q9, q15, #3 @ shift left 1 \n" // q8 = {0,1,3,5} - - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 0, " - "out0\n" // q10 * w01 - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 0, " - "out0\n" // q11 * w02 - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 0, " - "out0\n" // q6 * w00 - - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, " - "out0\n" // q12 * w11 - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, " - "out0\n" // q13 * w12 - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, " - "out0\n" // q7 * w10 - - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 2, " - "out0\n" // q14 * w20 - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 2, " - "out0\n" // q15 * w21 - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 2, " - "out0\n" // q8 * w22 - - "vadd.f32 q3, q3, q4 @ add \n" - "vadd.f32 q3, q3, q5 @ add \n" - - "vmax.f32 q3, q3, q9 @ relu\n" - - "vst1.32 {d6-d7}, [%[out]] \n" - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); -#endif // __aarch64__ - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - hs += 2; - he += 2; - } - } - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_3x3s1.cc b/lite/backends/arm/math/conv_depthwise_3x3s1.cc deleted file mode 100644 index 8d0ebb58ad..0000000000 --- a/lite/backends/arm/math/conv_depthwise_3x3s1.cc +++ /dev/null @@ -1,2539 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/backends/arm/math/conv_depthwise.h" -#include - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void conv_depthwise_3x3s1p0_bias(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx); - -void conv_depthwise_3x3s1p0_bias_s(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx); - -void conv_depthwise_3x3s1p1_bias(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx); - -void conv_depthwise_3x3s1p1_bias_s(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx); - -void conv_depthwise_3x3s1_fp32(const float *din, - float *dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float *weights, - const float *bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext *ctx) { - if (pad == 0) { - if (w_in > 5) { - conv_depthwise_3x3s1p0_bias(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p0_bias_s(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - if (pad == 1) { - if (w_in > 4) { - conv_depthwise_3x3s1p1_bias(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s1p1_bias_s(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } -} - -#ifdef __aarch64__ -#define INIT_S1 \ - "PRFM PLDL1KEEP, [%[din_ptr0]] \n" \ - "PRFM PLDL1KEEP, [%[din_ptr1]] \n" \ - "PRFM PLDL1KEEP, [%[din_ptr2]] \n" \ - "PRFM PLDL1KEEP, [%[din_ptr3]] \n" \ - "PRFM PLDL1KEEP, [%[din_ptr4]] \n" \ - "PRFM PLDL1KEEP, [%[din_ptr5]] \n" \ - "movi v21.4s, #0x0\n" /* out0 = 0 */ \ - \ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ - -#define LEFT_COMPUTE_S1 \ - "ext v16.16b, %[vzero].16b, v0.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ /* r0 */ \ - "fmla v12.4s, v0.4s, %[w0].s[1]\n" /* outr00 += din0_0123 * w0[1]*/ \ - \ - "ld1 
{v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "sub %[din_ptr0], %[din_ptr0], #4 \n" /* din_ptr0-- */ \ - "sub %[din_ptr1], %[din_ptr1], #4 \n" /* din_ptr0-- */ \ - \ - "fmla v12.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din0_0012 * w0[0]*/ \ - \ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ - "sub %[din_ptr2], %[din_ptr2], #4 \n" /* din_ptr0-- */ \ - "sub %[din_ptr3], %[din_ptr3], #4 \n" /* din_ptr0-- */ \ - \ - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_1234 * w0[2]*/ \ - \ - "ext v16.16b, %[vzero].16b, v2.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234 */ /* r1 */ \ - "fmla v13.4s , v2.4s, %[w0].s[1]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v12.4s , v2.4s, %[w1].s[1]\n" /* outr00 += din1_0123 * w1[1]*/ \ - "sub %[din_ptr4], %[din_ptr4], #4 \n" /* din_ptr0-- */ \ - "sub %[din_ptr5], %[din_ptr5], #4 \n" /* din_ptr0-- */ \ - \ - "fmla v13.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v12.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ext v17.16b, v4.16b, v5.16b, #4 \n" /* v16=1234 */ \ - "ext v16.16b, %[vzero].16b, v4.16b, #12 \n" /* v16 = 00123*/ \ - \ - /* r2 */ \ - "fmla v14.4s , v4.4s, %[w0].s[1]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v13.4s , v4.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - "fmla v12.4s , v4.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ - \ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v14.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v12.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ext v16.16b, %[vzero].16b, v6.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234 */ /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[1]\n" /*outr00 += din2_0123 * w0[1]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ - \ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v16.4s, %[w0].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ext v16.16b, %[vzero].16b, v8.16b, #12 \n" /* v16 = 00123*/ \ - 
"ext v17.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234 */ - -#define LEFT_RESULT_S1 \ - /* r4 */ \ - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ - \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ /* r5 */ \ - "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - \ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - \ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ - "cmp %w[cnt], #1 \n" \ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "blt 3f \n" - -#define MID_COMPUTE_S1 \ - "1: \n" /* r0 */ \ - "fmla v12.4s , v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v0.4s}, [%[din_ptr0]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v12.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v1.4s}, [%[din_ptr0]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v2.4s}, [%[din_ptr1]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v3.4s}, [%[din_ptr1]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v4.4s}, [%[din_ptr2]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v12.4s , v16.4s, 
%[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v5.4s}, [%[din_ptr2]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - -#define MID_RESULT_S1 \ - /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" \ - \ - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" \ - \ - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" \ - \ - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ - \ - "subs %w[cnt], %w[cnt], #1 \n" \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" \ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "bne 1b \n" - -#define RIGHT_COMPUTE_S1 \ - "3: \n" \ - "ld1 {v18.4s, v19.4s}, [%[vmask]] \n" \ - "ld1 {v22.4s}, [%[doutr0]] \n" \ - "ld1 {v23.4s}, [%[doutr1]] \n" \ - "ld1 {v24.4s}, [%[doutr2]] \n" \ - "ld1 {v25.4s}, [%[doutr3]] \n" \ - \ - "bif v0.16b, %[vzero].16b, v18.16b \n" \ - "bif v1.16b, %[vzero].16b, v19.16b \n" \ - "bif v2.16b, %[vzero].16b, v18.16b \n" \ - "bif v3.16b, %[vzero].16b, v19.16b \n" \ - \ - "bif v4.16b, %[vzero].16b, v18.16b \n" \ - "bif 
v5.16b, %[vzero].16b, v19.16b \n" \ - "bif v6.16b, %[vzero].16b, v18.16b \n" \ - "bif v7.16b, %[vzero].16b, v19.16b \n" \ - \ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ /* r0 */ \ - "fmla v12.4s, v0.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "bif v8.16b, %[vzero].16b, v18.16b \n" \ - "bif v9.16b, %[vzero].16b, v19.16b \n" \ - "bif v10.16b, %[vzero].16b, v18.16b \n" \ - "bif v11.16b, %[vzero].16b, v19.16b \n" \ - \ - "fmla v12.4s, v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "ld1 {v18.4s}, [%[rmask]] \n" \ - \ - "fmla v12.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v2.16b, v3.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v2.16b, v3.16b, #8 \n" /* v16 = 2345 */ /* r1 */ \ - "fmla v13.4s , v2.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v12.4s , v2.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "fmla v13.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v12.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "fmla v13.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v12.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v4.16b, v5.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v4.16b, v5.16b, #8 \n" /* v16 = 2345 */ /* r2 */ \ - "fmla v14.4s , v4.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v4.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v12.4s , v4.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "fmla v14.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v12.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "fmla v14.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v12.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v6.16b, v7.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v6.16b, v7.16b, #8 \n" /* v16 = 2345 */ - -#define RIGHT_RESULT_S1 \ - /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "bif v12.16b, v22.16b, v18.16b \n" \ - \ - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" \ - \ - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "bif v13.16b, v23.16b, v18.16b \n" \ - \ - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 
+= din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "bif v14.16b, v24.16b, v18.16b \n" \ - \ - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "bif v15.16b, v25.16b, v18.16b \n" \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" - -#define LEFT_RESULT_S1_RELU \ - /* r4 */ \ - "fmla v15.4s , v8.4s, %[w1].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w2[1]*/ \ - \ - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ - \ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v16.4s, %[w1].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" /* vst1q_f32() */ \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* vst1q_f32() */ \ - \ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w1[1]*/ \ - \ - "ext v16.16b, %[vzero].16b, v10.16b, #12 \n" /* v16 = 00123*/ \ - "ext v17.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234 */ \ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ /* r5*/ \ - "fmla v15.4s , v10.4s, %[w2].s[1]\n" /* outr00 += din2_0123 * w1[1]*/ \ - \ - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ - \ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v16.4s, %[w2].s[0]\n" /* outr00 += din2_0123 * w0[1]*/ \ - \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" /* vst1q_f32() */ \ - \ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din1_0123 * w0[1]*/ \ - \ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ - \ - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" /* vst1q_f32() */ \ - "cmp %w[cnt], #1 \n" \ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - "blt 3f \n" - -#define MID_RESULT_S1_RELU \ - /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v6.4s}, [%[din_ptr3]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" \ - \ - "ld1 {v7.4s}, [%[din_ptr3]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v12.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += 
din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" \ - \ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v13.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ - \ - /* r3 */ \ - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ \ - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" \ - \ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ \ - "ld1 {v14.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v16 = 2345 */ \ - \ - "subs %w[cnt], %w[cnt], #1 \n" \ - \ - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" \ - "ld1 {v15.4s}, [%[bias_val]] \n" /*vdupq_n_f32(bias_val)*/ \ - \ - "bne 1b \n" - -#define RIGHT_RESULT_S1_RELU \ - /* r3 */ \ - "fmla v15.4s , v6.4s, %[w0].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v6.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v13.4s , v6.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "fmax v12.4s, v12.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w0].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v13.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "bif v12.16b, v22.16b, v18.16b \n" \ - \ - "fmla v15.4s , v17.4s, %[w0].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v13.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v8.16b, v9.16b, #4 \n" /* v16 = 1234*/ \ - "ext v17.16b, v8.16b, v9.16b, #8 \n" /* v16 = 2345 */ /* r3 */ \ - "fmla v15.4s , v8.4s, %[w1].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - "fmla v14.4s , v8.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "st1 {v12.4s}, [%[doutr0]], #16 \n" \ - "fmax v13.4s, v13.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w1].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - "fmla v14.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "bif v13.16b, v23.16b, v18.16b \n" \ - \ - "fmla v15.4s , v17.4s, %[w1].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - "fmla v14.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "ext v16.16b, v10.16b, v11.16b, #4 \n" /* v16 = 1234*/ 
\ - "ext v17.16b, v10.16b, v11.16b, #8 \n" /* v16 = 2345 */ \ - \ - "st1 {v13.4s}, [%[doutr1]], #16 \n" /* r3 */ \ - "fmla v15.4s , v10.4s, %[w2].s[0]\n" /* outr00 += din0_0123 * w0[0]*/ \ - \ - "fmax v14.4s, v14.4s, %[vzero].4s \n" /*relu*/ \ - \ - "fmla v15.4s , v16.4s, %[w2].s[1]\n" /* outr00 += din0_1234 * w0[1]*/ \ - \ - "bif v14.16b, v24.16b, v18.16b \n" \ - \ - "fmla v15.4s , v17.4s, %[w2].s[2]\n" /* outr00 += din0_2345 * w0[2]*/ \ - \ - "st1 {v14.4s}, [%[doutr2]], #16 \n" \ - \ - "fmax v15.4s, v15.4s, %[vzero].4s \n" /*relu*/ \ - \ - "bif v15.16b, v25.16b, v18.16b \n" \ - \ - "st1 {v15.4s}, [%[doutr3]], #16 \n" - -#define COMPUTE_S_S1 \ - "prfm pldl1keep, [%[din0]]\n" \ - "prfm pldl1keep, [%[din1]]\n" \ - "prfm pldl1keep, [%[din2]]\n" \ - "prfm pldl1keep, [%[din3]]\n" \ - \ - "ld1 {v0.4s}, [%[din0]], #16\n" \ - "ld1 {v1.4s}, [%[din1]], #16\n" \ - "ld1 {v2.4s}, [%[din2]], #16\n" \ - "ld1 {v3.4s}, [%[din3]], #16\n" \ - \ - "bif v0.16b, %[zero].16b, %[mask].16b\n" \ - "bif v1.16b, %[zero].16b, %[mask].16b\n" \ - "bif v2.16b, %[zero].16b, %[mask].16b\n" \ - "bif v3.16b, %[zero].16b, %[mask].16b\n" \ - \ - "ext v4.16b, %[zero].16b, v0.16b, #12\n" \ - "ext v5.16b, %[zero].16b, v1.16b, #12\n" \ - "ext v6.16b, %[zero].16b, v2.16b, #12\n" \ - "ext v7.16b, %[zero].16b, v3.16b, #12\n" \ - \ - "ext v8.16b, v0.16b, %[zero].16b, #4\n" \ - "ext v9.16b, v1.16b, %[zero].16b, #4\n" \ - "ext v10.16b, v2.16b, %[zero].16b, #4\n" \ - "ext v11.16b, v3.16b, %[zero].16b, #4\n" \ - \ - "fmul v12.4s, v0.4s, %[wr0].s[1]\n" \ - "fmul v13.4s, v1.4s, %[wr0].s[1]\n" \ - \ - "fmul v14.4s, v1.4s, %[wr1].s[1]\n" \ - "fmul v15.4s, v2.4s, %[wr1].s[1]\n" \ - \ - "fmul v16.4s, v2.4s, %[wr2].s[1]\n" \ - "fmul v17.4s, v3.4s, %[wr2].s[1]\n" \ - \ - "fmla v12.4s, v4.4s, %[wr0].s[0]\n" \ - "fmla v13.4s, v5.4s, %[wr0].s[0]\n" \ - \ - "fmla v14.4s, v5.4s, %[wr1].s[0]\n" \ - "fmla v15.4s, v6.4s, %[wr1].s[0]\n" \ - \ - "fmla v16.4s, v6.4s, %[wr2].s[0]\n" \ - "fmla v17.4s, v7.4s, %[wr2].s[0]\n" \ - \ - "fmla v12.4s, v8.4s, %[wr0].s[2]\n" \ - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ - \ - "fmla v14.4s, v9.4s, %[wr1].s[2]\n" \ - "fmla v15.4s, v10.4s, %[wr1].s[2]\n" \ - \ - "fmla v16.4s, v10.4s, %[wr2].s[2]\n" \ - "fmla v17.4s, v11.4s, %[wr2].s[2]\n" \ - \ - "fadd v12.4s, v12.4s, v14.4s\n" \ - "fadd v12.4s, v12.4s, v16.4s\n" \ - \ - "fadd v13.4s, v13.4s, v15.4s\n" \ - "fadd v13.4s, v13.4s, v17.4s\n" \ - \ - "fadd v12.4s, v12.4s, %[bias].4s\n" \ - "fadd v13.4s, v13.4s, %[bias].4s\n" - -#define RESULT_S_S1 \ - "prfm pldl1keep, [%[out1]]\n" \ - "prfm pldl1keep, [%[out2]]\n" \ - \ - "st1 {v12.4s}, [%[out1]]\n" \ - "st1 {v13.4s}, [%[out2]]\n" - -#define RESULT_S_S1_RELU \ - "prfm pldl1keep, [%[out1]]\n" \ - "prfm pldl1keep, [%[out2]]\n" \ - \ - "fmax v12.4s, v12.4s, %[zero].4s\n" \ - "fmax v13.4s, v13.4s, %[zero].4s\n" \ - \ - "st1 {v12.4s}, [%[out1]]\n" \ - "st1 {v13.4s}, [%[out2]]\n" - -#define COMPUTE_S_S1_P0 \ - "prfm pldl1keep, [%[din0]]\n" \ - "prfm pldl1keep, [%[din1]]\n" \ - "prfm pldl1keep, [%[din2]]\n" \ - "prfm pldl1keep, [%[din3]]\n" \ - \ - "ld1 {v0.4s, v1.4s}, [%[din0]]\n" \ - "ld1 {v2.4s, v3.4s}, [%[din1]]\n" \ - "ld1 {v4.4s, v5.4s}, [%[din2]]\n" \ - "ld1 {v6.4s, v7.4s}, [%[din3]]\n" \ - \ - "bif v0.16b, %[zero].16b, %[mask1].16b\n" \ - "bif v1.16b, %[zero].16b, %[mask2].16b\n" \ - \ - "bif v2.16b, %[zero].16b, %[mask1].16b\n" \ - "bif v3.16b, %[zero].16b, %[mask2].16b\n" \ - \ - "bif v4.16b, %[zero].16b, %[mask1].16b\n" \ - "bif v5.16b, %[zero].16b, %[mask2].16b\n" \ - \ - "bif v6.16b, %[zero].16b, %[mask1].16b\n" \ - "bif 
v7.16b, %[zero].16b, %[mask2].16b\n" \ - \ - "ext v8.16b, v0.16b, v1.16b, #4\n" \ - "ext v9.16b, v0.16b, v1.16b, #8\n" \ - \ - "and v12.16b, %[vbias].16b, %[vbias].16b \n" \ - "and v13.16b, %[vbias].16b, %[vbias].16b \n" /* r0 */ \ - "fmul v10.4s, v0.4s, %[wr0].s[0]\n" \ - "fmul v11.4s, v8.4s, %[wr0].s[1]\n" \ - "fmla v12.4s, v9.4s, %[wr0].s[2]\n" \ - \ - "ext v8.16b, v2.16b, v3.16b, #4\n" \ - "ext v9.16b, v2.16b, v3.16b, #8\n" /* r1 */ \ - "fmul v14.4s, v2.4s, %[wr0].s[0]\n" \ - "fmla v10.4s, v2.4s, %[wr1].s[0]\n" \ - \ - "fmul v15.4s, v8.4s, %[wr0].s[1]\n" \ - "fmla v11.4s, v8.4s, %[wr1].s[1]\n" \ - \ - "fmla v13.4s, v9.4s, %[wr0].s[2]\n" \ - "fmla v12.4s, v9.4s, %[wr1].s[2]\n" \ - \ - "ext v8.16b, v4.16b, v5.16b, #4\n" \ - "ext v9.16b, v4.16b, v5.16b, #8\n" /* r2 */ \ - "fmla v14.4s, v4.4s, %[wr1].s[0]\n" \ - "fmla v10.4s, v4.4s, %[wr2].s[0]\n" \ - \ - "fmla v15.4s, v8.4s, %[wr1].s[1]\n" \ - "fmla v11.4s, v8.4s, %[wr2].s[1]\n" \ - \ - "fmla v13.4s, v9.4s, %[wr1].s[2]\n" \ - "fmla v12.4s, v9.4s, %[wr2].s[2]\n" \ - \ - "ext v8.16b, v6.16b, v7.16b, #4\n" \ - "ext v9.16b, v6.16b, v7.16b, #8\n" \ - \ - "fmla v14.4s, v6.4s, %[wr2].s[0]\n" \ - \ - "fmla v15.4s, v8.4s, %[wr2].s[1]\n" \ - \ - "fadd v12.4s, v12.4s, v10.4s\n" \ - \ - "fmla v13.4s, v9.4s, %[wr2].s[2]\n" \ - \ - "fadd v12.4s, v12.4s, v11.4s\n" \ - "fadd v13.4s, v13.4s, v14.4s\n" \ - "fadd v13.4s, v13.4s, v15.4s\n" // \ - // "prfm pldl1keep, [%[out1]]\n" \ - // "prfm pldl1keep, [%[out2]]\n" \ - // \ - // "st1 {v12.4s}, [%[out1]]\n" \ - // "st1 {v13.4s}, [%[out2]]\n" \ - - -#else -#define INIT_S1 \ - "pld [%[din0_ptr]] @ preload data\n" \ - "pld [%[din1_ptr]] @ preload data\n" \ - "pld [%[din2_ptr]] @ preload data\n" \ - "pld [%[din3_ptr]] @ preload data\n" \ - \ - "vld1.32 {d16-d18}, [%[din0_ptr]]! @ load din r0\n" \ - "vld1.32 {d20-d22}, [%[din1_ptr]]! @ load din r1\n" \ - "vld1.32 {d24-d26}, [%[din2_ptr]]! @ load din r2\n" \ - "vld1.32 {d28-d30}, [%[din3_ptr]]! @ load din r3\n" \ - \ - "vdup.32 q4, %[bias_val] @ and \n" \ - "vdup.32 q5, %[bias_val] @ and \n" - -#define LEFT_COMPUTE_S1 \ - "vext.32 q6, %q[vzero], q8, #3 @ 0012\n" \ - "vext.32 q7, q8, q9, #1 @ 1234\n" /* r0 */ \ - "vmla.f32 q4, q8, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "sub %[din0_ptr], #12 @ 1pad + 2 float data overlap\n" \ - "sub %[din1_ptr], #12 @ 1pad + 2 float data overlap\n" \ - "sub %[din2_ptr], #12 @ 1pad + 2 float data overlap\n" \ - "sub %[din3_ptr], #12 @ 1pad + 2 float data overlap\n" \ - \ - "vmla.f32 q4, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ - \ - "pld [%[din0_ptr]] @ preload data\n" \ - "pld [%[din1_ptr]] @ preload data\n" \ - "pld [%[din2_ptr]] @ preload data\n" \ - "pld [%[din3_ptr]] @ preload data\n" \ - \ - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ - \ - "vext.32 q6, %q[vzero], q10, #3 @ 0012\n" \ - "vext.32 q7, q10, q11, #1 @ 1234\n" \ - \ - /* r1 */ \ - "vmla.f32 q5, q10, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q10, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ - "vld1.32 {d20-d21}, [%[din1_ptr]]! 
@ load din r0\n" \ - \ - "vmla.f32 q5, q6, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ - \ - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ - \ - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[2]\n" \ - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ - \ - "vext.32 q6, %q[vzero], q12, #3 @ 0012\n" \ - "vext.32 q7, q12, q13, #1 @ 1234\n" \ - \ - /* r2 */ \ - "vmla.f32 q5, q12, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q12, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d24-d25}, [%[din2_ptr]]! @ load din r0\n" \ - \ - "vmla.f32 q5, q6, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ - \ - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ - \ - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[2]\n" \ - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ - \ - "vext.32 q6, %q[vzero], q14, #3 @ 0012\n" \ - "vext.32 q7, q14, q15, #1 @ 1234\n" - -#define LEFT_RESULT_S1 \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ - \ - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ - "vdup.32 q4, %[bias_val] @ and \n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" \ - "cmp %[cnt], #1 @ check whether has mid cols\n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ - \ - "vdup.32 q5, %[bias_val] @ and \n" \ - "blt 3f @ jump to main loop start point\n" - -#define MID_COMPUTE_S1 \ - "1: @ right pad entry\n" /* r0 */ \ - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "pld [%[din0_ptr]] @ preload data\n" \ - "pld [%[din1_ptr]] @ preload data\n" \ - "pld [%[din2_ptr]] @ preload data\n" \ - "pld [%[din3_ptr]] @ preload data\n" \ - \ - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d16-d17}, [%[din0_ptr]]! @ load din r0\n" \ - \ - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vld1.32 {d18}, [%[din0_ptr]] @ load din r0\n" \ - \ - "vext.32 q6, q10, q11, #1 @ 1234\n" \ - "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d20-d21}, [%[din1_ptr]]! @ load din r0\n" \ - \ - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d22}, [%[din1_ptr]] @ load din r0\n" \ - \ - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q12, q13, #1 @ 1234\n" \ - "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d24-d25}, [%[din2_ptr]]! 
@ load din r0\n" \ - \ - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d26}, [%[din2_ptr]] @ load din r0\n" \ - \ - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q14, q15, #1 @ 1234\n" \ - "vext.32 q7, q14, q15, #2 @ 2345\n" - -#define MID_RESULT_S1 \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ - "vdup.32 q4, %[bias_val] @ and \n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ - \ - "subs %[cnt], #1 @ loop count minus 1\n" \ - \ - "vdup.32 q5, %[bias_val] @ and \n" \ - \ - "bne 1b @ jump to main loop start point\n" - -#define RIGHT_COMPUTE_S1 \ - "3: @ right pad entry\n" \ - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ - \ - "vld1.32 {d27}, [%[vmask]]! @ load din r0\n" \ - "vld1.32 {d31}, [%[vmask]]! @ load din r0\n" \ - \ - "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vext.32 q6, q10, q11, #1 @ 1234\n" \ - "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d19}, [%[rmask]]! @ load din r0\n" \ - "vld1.32 {d23}, [%[rmask]]! 
@ load din r0\n" \ - \ - "vmla.f32 q5, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d16-d17}, [%[dout_ptr1]] @ load din r0\n" \ - "vld1.32 {d20-d21}, [%[dout_ptr2]] @ load din r0\n" \ - \ - "vmla.f32 q5, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q12, q13, #1 @ 1234\n" \ - "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmla.f32 q5, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmla.f32 q5, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q4, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q14, q15, #1 @ 1234\n" \ - "vext.32 q7, q14, q15, #2 @ 2345\n" - -#define RIGHT_RESULT_S1 \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ - "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ - "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" - -#define LEFT_RESULT_S1_RELU \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ - "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ - \ - "vmla.f32 q5, q6, %e[wr2][0] @ q4 += 1234 * wr0[0]\n" \ - \ - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 1234 * wr0[2]\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" \ - "vdup.32 q4, %[bias_val] @ and \n" \ - \ - "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ - \ - "cmp %[cnt], #1 @ check whether has mid cols\n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" \ - \ - "vdup.32 q5, %[bias_val] @ and \n" \ - "blt 3f @ jump to main loop start point\n" - -#define MID_RESULT_S1_RELU \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vld1.32 {d28-d29}, [%[din3_ptr]]! @ load din r0\n" \ - "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ - \ - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vld1.32 {d30}, [%[din3_ptr]] @ load din r0\n" \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" \ - "vdup.32 q4, %[bias_val] @ and \n" \ - \ - "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! 
@ store result, add pointer\n" \ - \ - "subs %[cnt], #1 @ loop count minus 1\n" \ - \ - "vdup.32 q5, %[bias_val] @ and \n" \ - \ - "bne 1b @ jump to main loop start point\n" - -#define RIGHT_RESULT_S1_RELU \ - /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vmax.f32 q4, q4, %q[vzero] @ relu \n" \ - \ - "vmla.f32 q5, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vbif d8, d16, d19 @ bit select, deal with right pad\n" \ - "vbif d9, d17, d23 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q5, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ - "vst1.32 {d8-d9}, [%[dout_ptr1]]! @ store result, add pointer\n" \ - \ - "vmax.f32 q5, q5, %q[vzero] @ relu \n" \ - \ - "vbif d10, d20, d19 @ bit select, deal with right pad\n" \ - "vbif d11, d21, d23 @ bit select, deal with right pad\n" \ - \ - "vst1.32 {d10-d11}, [%[dout_ptr2]]! @ store result, add pointer\n" - -#define COMPUTE_S_S1 \ - "pld [%[din0]]\n" \ - "pld [%[din1]]\n" \ - "pld [%[din2]]\n" \ - "pld [%[din3]]\n" \ - \ - "vld1.32 {d12-d13}, [%[din0]]!\n" \ - "vld1.32 {d14-d15}, [%[din1]]!\n" \ - "vld1.32 {d16-d17}, [%[din2]]!\n" \ - "vld1.32 {d18-d19}, [%[din3]]!\n" \ - \ - "vbif q6, %q[vzero], %q[mask]\n" \ - "vbif q7, %q[vzero], %q[mask]\n" \ - "vbif q8, %q[vzero], %q[mask]\n" \ - "vbif q9, %q[vzero], %q[mask]\n" \ - \ - "vmul.f32 q14, q6, %e[wr0][1]\n" \ - "vmul.f32 q15, q7, %e[wr0][1]\n" \ - \ - "vmla.f32 q14, q7, %e[wr1][1]\n" \ - "vmla.f32 q15, q8, %e[wr1][1]\n" \ - \ - "vmla.f32 q14, q8, %e[wr2][1]\n" \ - "vmla.f32 q15, q9, %e[wr2][1]\n" \ - \ - "vext.32 q10, %q[vzero], q6, #3\n" \ - "vext.32 q11, %q[vzero], q7, #3\n" \ - "vext.32 q12, %q[vzero], q8, #3\n" \ - "vext.32 q13, %q[vzero], q9, #3\n" \ - \ - "vmla.f32 q14, q10, %e[wr0][0]\n" \ - "vmla.f32 q15, q11, %e[wr0][0]\n" \ - \ - "vmla.f32 q14, q11, %e[wr1][0]\n" \ - "vmla.f32 q15, q12, %e[wr1][0]\n" \ - \ - "vmla.f32 q14, q12, %e[wr2][0]\n" \ - "vmla.f32 q15, q13, %e[wr2][0]\n" \ - \ - "vext.32 q10, q6, %q[vzero], #1\n" \ - "vext.32 q11, q7, %q[vzero], #1\n" \ - "vext.32 q12, q8, %q[vzero], #1\n" \ - "vext.32 q13, q9, %q[vzero], #1\n" \ - \ - "vmla.f32 q14, q10, %f[wr0][0]\n" \ - "vmla.f32 q15, q11, %f[wr0][0]\n" \ - \ - "vmla.f32 q14, q11, %f[wr1][0]\n" \ - "vmla.f32 q15, q12, %f[wr1][0]\n" \ - \ - "vmla.f32 q14, q12, %f[wr2][0]\n" \ - "vmla.f32 q15, q13, %f[wr2][0]\n" \ - \ - "vadd.f32 q14, q14, %q[bias]\n" \ - "vadd.f32 q15, q15, %q[bias]\n" - -#define RESULT_S_S1 \ - "pld [%[out1]]\n" \ - "pld [%[out2]]\n" \ - \ - "vst1.32 {d28-d29}, [%[out1]]\n" \ - "vst1.32 {d30-d31}, [%[out2]]\n" - -#define RESULT_S_S1_RELU \ - "pld [%[out1]]\n" \ - "pld [%[out2]]\n" \ - \ - "vmax.f32 q14, q14, %q[vzero]\n" \ - "vmax.f32 q15, q15, %q[vzero]\n" \ - \ - "vst1.32 {d28-d29}, [%[out1]]\n" \ - "vst1.32 {d30-d31}, [%[out2]]\n" - -#define COMPUTE_S_S1_P0 \ - "pld [%[din0]]\n" \ - "pld [%[din1]]\n" \ - "pld [%[din2]]\n" \ - "pld [%[din3]]\n" \ - "vld1.32 {d16-d18}, [%[din0]] @ load din r0\n" \ - "vld1.32 {d20-d22}, [%[din1]] @ load din r1\n" \ - "vld1.32 {d24-d26}, [%[din2]] @ load din r2\n" \ - "vld1.32 {d28-d30}, [%[din3]] @ load din r3\n" \ - \ - "vdup.32 q4, %[bias_val] @ and \n" \ - "vdup.32 q5, %[bias_val] @ and \n" \ - \ - "vld1.32 {d19}, [%[vmask]]! @ load din r0\n" \ - "vld1.32 {d23}, [%[vmask]]! @ load din r0\n" \ - \ - "vld1.32 {d27}, [%[vmask]]! 
@ load din r0\n" \ - \ - "vbif d16, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d20, %e[vzero], d19 @ bit select, deal with right pad\n" \ - \ - "vbif d17, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d21, %e[vzero], d23 @ bit select, deal with right pad\n" \ - \ - "vbif d18, %e[vzero], d27 @ bit select, deal with right pad\n" \ - "vbif d22, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vext.32 q6, q8, q9, #1 @ 1234\n" \ - "vext.32 q7, q8, q9, #2 @ 2345\n" /* r0 */ \ - "vmla.f32 q4, q8, %e[wr0][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vbif d24, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d25, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d26, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q4, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vbif d28, %e[vzero], d19 @ bit select, deal with right pad\n" \ - "vbif d29, %e[vzero], d23 @ bit select, deal with right pad\n" \ - "vbif d30, %e[vzero], d27 @ bit select, deal with right pad\n" \ - \ - "vmla.f32 q4, q7, %f[wr0][0] @ q4 += 2345 * wr0[2]\n" \ - \ - "vext.32 q6, q10, q11, #1 @ 1234\n" \ - "vext.32 q7, q10, q11, #2 @ 2345\n" /* r1 */ \ - "vmla.f32 q5, q10, %e[wr0][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q10, %e[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmul.f32 q8, q6, %e[wr0][1] @ q4 += 1234 * wr0[1]\n" \ - "vmul.f32 q10, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmul.f32 q9, q7, %f[wr0][0] @ q4 += 1234 * wr0[1]\n" \ - "vmul.f32 q11, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q12, q13, #1 @ 1234\n" \ - "vext.32 q7, q12, q13, #2 @ 2345\n" /* r2 */ \ - "vmla.f32 q5, q12, %e[wr1][0] @ q4 += 1234 * wr0[0]\n" \ - "vmla.f32 q4, q12, %e[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmla.f32 q8, q6, %e[wr1][1] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q10, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - \ - "vmla.f32 q9, q7, %f[wr1][0] @ q4 += 1234 * wr0[1]\n" \ - "vmla.f32 q11, q7, %f[wr2][0] @ q4 += 1234 * wr0[1]\n" \ - \ - "vext.32 q6, q14, q15, #1 @ 1234\n" \ - "vext.32 q7, q14, q15, #2 @ 2345\n" /* r3 */ \ - "vmla.f32 q5, q14, %e[wr2][0] @ q4 += 0123 * wr0[0]\n" \ - \ - "vmla.f32 q8, q6, %e[wr2][1] @ q4 += 1234 * wr0[1]\n" \ - "vadd.f32 q4, q4, q10 @ q4 += q10 \n" \ - \ - "pld [%[out1]]\n" \ - "pld [%[out2]]\n" \ - \ - "vmla.f32 q9, q7, %f[wr2][0] @ q4 += 2345 * wr0[2]\n" \ - "vadd.f32 q14, q4, q11 @ q4 += q10 \n" \ - \ - "vadd.f32 q5, q5, q8 @ q4 += q10 \n" \ - "vadd.f32 q15, q5, q9 @ q4 += q10 \n" - -#endif -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width > 4 - */ -void conv_depthwise_3x3s1p1_bias(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! 
for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float *zero_ptr = ctx->workspace_data<float>(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float *write_ptr = zero_ptr + w_in; - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = (w_in + 3) >> 2; - int cnt_col = tile_w - 2; - - unsigned int size_pad_right = (unsigned int)(1 + (tile_w << 2) - w_in); - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float *din_batch = din + n * ch_in * size_in_channel; - float *dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - float *dout_ptr = dout_batch + c * size_out_channel; - - const float *din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float *wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - - float *doutr0 = dout_ptr; - float *doutr1 = doutr0 + w_out; - float *doutr2 = doutr1 + w_out; - float *doutr3 = doutr2 + w_out; - - const float *dr0 = din_ch_ptr; - const float *dr1 = dr0 + w_in; - const float *dr2 = dr1 + w_in; - const float *dr3 = dr2 + w_in; - const float *dr4 = dr3 + w_in; - const float *dr5 = dr4 + w_in; - - const float *din_ptr0 = dr0; - const float *din_ptr1 = dr1; - const float *din_ptr2 = dr2; - const float *din_ptr3 = dr3; - const float *din_ptr4 = dr4; - const float *din_ptr5 = dr5; - float *ptr_zero = const_cast<float *>(zero); -#ifdef __aarch64__ - for (int i = 0; i < h_in; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - din_ptr4 = dr3; - din_ptr5 = dr4; - dr0 = dr3; - dr1 = dr4; - dr2 = dr5; - } else { - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - } - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 > h_in) { - switch (i + 5 - h_in) { - case 5: - din_ptr1 = zero_ptr; - case 4: - din_ptr2 = zero_ptr; - case 3: - din_ptr3 = zero_ptr; - case 2: - din_ptr4 = zero_ptr; - case 1: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = cnt_col; - if (flag_relu) { - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 - MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - } else { - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 - MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - } - dout_ptr = dout_ptr + 4 * w_out; - } -#else - for (int i = 0; i < h_in; i += 2) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - - doutr0 = dout_ptr; - doutr1 = dout_ptr + w_out; - // unsigned int* rst_mask = rmask; - - if (i == 0) { - din_ptr0 = zero_ptr; - din_ptr1 = dr0; - din_ptr2 = dr1; - din_ptr3 = dr2; - dr0 = dr1; - dr1 = dr2; - dr2 = dr3; - dr3 = dr2 + w_in; - } else { - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - } - //! process bottom pad - if (i + 3 > h_in) { - switch (i + 3 - h_in) { - case 3: - din_ptr1 = zero_ptr; - case 2: - din_ptr2 = zero_ptr; - case 1: - din_ptr3 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = cnt_col; - unsigned int *rmask_ptr = rmask; - unsigned int *vmask_ptr = vmask; - if (flag_relu) { - asm volatile( - INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1_RELU MID_COMPUTE_S1 - MID_RESULT_S1_RELU RIGHT_COMPUTE_S1 RIGHT_RESULT_S1_RELU - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(INIT_S1 LEFT_COMPUTE_S1 LEFT_RESULT_S1 MID_COMPUTE_S1 - MID_RESULT_S1 RIGHT_COMPUTE_S1 RIGHT_RESULT_S1 - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } - dout_ptr += 2 * w_out; - } //! end of processing mid rows -#endif - } - } -} - -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p1_bias_s(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! 
for 4x6 convolution window - const int right_pad_idx[4] = {3, 2, 1, 0}; - const float zero[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(4 - w_in)); - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float *din_batch = din + n * ch_in * size_in_channel; - float *dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float *dout_channel = dout_batch + i * size_out_channel; - const float *din_channel = din_batch + i * size_in_channel; - const float *weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } - - int hs = -1; - int he = 3; - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - int h_cnt = (h_out + 1) >> 1; - float *doutr0 = dout_channel; - float *doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_cnt; ++j) { - const float *dr0 = din_channel + hs * w_in; - const float *dr1 = dr0 + w_in; - const float *dr2 = dr1 + w_in; - const float *dr3 = dr2 + w_in; - - if (hs == -1) { - dr0 = zero; - } - - switch (he - h_in) { - case 2: - dr2 = zero; - doutr1 = trash_buf; - case 1: - dr3 = zero; - default: - break; - } -#ifdef __aarch64__ - if (flag_relu) { - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); - } else { - asm volatile(COMPUTE_S_S1 RESULT_S_S1 - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [zero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17"); - } -#else - if (flag_relu) { - asm volatile(COMPUTE_S_S1 RESULT_S_S1_RELU - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(COMPUTE_S_S1 RESULT_S_S1 - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [mask] "w"(vmask_rp), - [bias] "w"(wbias), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } -#endif - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - doutr0 = doutr1; - doutr1 += w_out; - hs += 2; - he += 2; - } // end of processing heights - } // end of processing channels - } // end of processing 
batchs -} - -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width > 4 - */ -void conv_depthwise_3x3s1p0_bias(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx) { - //! pad is done implicit - const float zero[8] = {0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; - //! for 4x6 convolution window - const unsigned int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - - float *zero_ptr = ctx->workspace_data<float>(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float *write_ptr = zero_ptr + w_in; - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - int w_stride = 9; - - int tile_w = w_out >> 2; - int remain = w_out % 4; - - unsigned int size_pad_right = (unsigned int)(6 + (tile_w << 2) - w_in); - const int remian_idx[4] = {0, 1, 2, 3}; - - uint32x4_t vmask_rp1 = - vcgeq_u32(vld1q_u32(right_pad_idx), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_rp2 = - vcgeq_u32(vld1q_u32(right_pad_idx + 4), vdupq_n_u32(size_pad_right)); - uint32x4_t vmask_result = - vcgtq_s32(vdupq_n_s32(remain), vld1q_s32(remian_idx)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - unsigned int rmask[4]; - vst1q_u32(rmask, vmask_result); - - float32x4_t vzero = vdupq_n_f32(0.f); - - for (int n = 0; n < num; ++n) { - const float *din_batch = din + n * ch_in * size_in_channel; - float *dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int c = 0; c < ch_in; c++) { - float *dout_ptr = dout_batch + c * size_out_channel; - - const float *din_ch_ptr = din_batch + c * size_in_channel; - - float bias_val = flag_bias ? bias[c] : 0.f; - float vbias[4] = {bias_val, bias_val, bias_val, bias_val}; - - const float *wei_ptr = weights + c * w_stride; - - float32x4_t wr0 = vld1q_f32(wei_ptr); - float32x4_t wr1 = vld1q_f32(wei_ptr + 3); - float32x4_t wr2 = vld1q_f32(wei_ptr + 6); - - float *doutr0 = dout_ptr; - float *doutr1 = doutr0 + w_out; - float *doutr2 = doutr1 + w_out; - float *doutr3 = doutr2 + w_out; - - const float *dr0 = din_ch_ptr; - const float *dr1 = dr0 + w_in; - const float *dr2 = dr1 + w_in; - const float *dr3 = dr2 + w_in; - const float *dr4 = dr3 + w_in; - const float *dr5 = dr4 + w_in; - - const float *din_ptr0 = dr0; - const float *din_ptr1 = dr1; - const float *din_ptr2 = dr2; - const float *din_ptr3 = dr3; - const float *din_ptr4 = dr4; - const float *din_ptr5 = dr5; - - float *ptr_zero = const_cast<float *>(zero); -#ifdef __aarch64__ - for (int i = 0; i < h_out; i += 4) { - //! process top pad pad_h = 1 - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - din_ptr4 = dr4; - din_ptr5 = dr5; - - doutr0 = dout_ptr; - doutr1 = doutr0 + w_out; - doutr2 = doutr1 + w_out; - doutr3 = doutr2 + w_out; - - dr0 = dr4; - dr1 = dr5; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - dr5 = dr4 + w_in; - - //! process bottom pad - if (i + 5 >= h_in) { - switch (i + 5 - h_in) { - case 4: - din_ptr1 = zero_ptr; - case 3: - din_ptr2 = zero_ptr; - case 2: - din_ptr3 = zero_ptr; - case 1: - din_ptr4 = zero_ptr; - case 0: - din_ptr5 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 4 > h_out) { - switch (i + 4 - h_out) { - case 3: - doutr1 = write_ptr; - case 2: - doutr2 = write_ptr; - case 1: - doutr3 = write_ptr; - default: - break; - } - } - - int cnt = tile_w; - if (flag_relu) { - asm volatile( - INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - MID_COMPUTE_S1 MID_RESULT_S1_RELU - "cmp %w[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_RELU "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - } else { - asm volatile( - INIT_S1 - "ld1 {v8.4s}, [%[din_ptr4]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v10.4s}, [%[din_ptr5]], #16 \n" /*vld1q_f32(din_ptr0)*/ - "ext v16.16b, v0.16b, v1.16b, #4 \n" /* v16 = 1234 */ - "ext v17.16b, v0.16b, v1.16b, #8 \n" /* v17 = 2345 */ - "ld1 {v9.4s}, [%[din_ptr4]] \n" /*vld1q_f32(din_ptr0)*/ - "ld1 {v11.4s}, [%[din_ptr5]] \n" /*vld1q_f32(din_ptr0)*/ - MID_COMPUTE_S1 MID_RESULT_S1 - "cmp %w[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1 "0: \n" - : [cnt] "+r"(cnt), - [din_ptr0] "+r"(din_ptr0), - [din_ptr1] "+r"(din_ptr1), - [din_ptr2] "+r"(din_ptr2), - [din_ptr3] "+r"(din_ptr3), - [din_ptr4] "+r"(din_ptr4), - [din_ptr5] "+r"(din_ptr5), - [doutr0] "+r"(doutr0), - [doutr1] "+r"(doutr1), - [doutr2] "+r"(doutr2), - [doutr3] "+r"(doutr3) - : [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [bias_val] "r"(vbias), - [vmask] "r"(vmask), - [rmask] "r"(rmask), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21", - "v22", - "v23", - "v24", - "v25"); - } - dout_ptr = dout_ptr + 4 * w_out; - } -#else - for (int i = 0; i < h_out; i += 2) { - din_ptr0 = dr0; - din_ptr1 = dr1; - din_ptr2 = dr2; - din_ptr3 = dr3; - - doutr0 = dout_ptr; - doutr1 = dout_ptr + w_out; - - dr0 = dr2; - dr1 = dr3; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - //! process bottom pad - if (i + 3 >= h_in) { - switch (i + 3 - h_in) { - case 3: - din_ptr1 = zero_ptr; - case 2: - din_ptr2 = zero_ptr; - case 1: - din_ptr3 = zero_ptr; - case 0: - din_ptr3 = zero_ptr; - default: - break; - } - } - //! 
process bottom remain - if (i + 2 > h_out) { - doutr1 = write_ptr; - } - int cnt = tile_w; - unsigned int *rmask_ptr = rmask; - unsigned int *vmask_ptr = vmask; - if (flag_relu) { - asm volatile(INIT_S1 - "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" - "vext.32 q6, q8, q9, #1 @ 0012\n" - "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 - MID_RESULT_S1_RELU - "cmp %[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1_RELU "0: \n" - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(INIT_S1 - "sub %[din0_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din1_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din2_ptr], #8 @ 0pad + 2 float data overlap\n" - "sub %[din3_ptr], #8 @ 0pad + 2 float data overlap\n" - "vext.32 q6, q8, q9, #1 @ 0012\n" - "vext.32 q7, q8, q9, #2 @ 1234\n" MID_COMPUTE_S1 - MID_RESULT_S1 - "cmp %[remain], #1 \n" - "blt 0f \n" RIGHT_COMPUTE_S1 - RIGHT_RESULT_S1 "0: \n" - : [dout_ptr1] "+r"(doutr0), - [dout_ptr2] "+r"(doutr1), - [din0_ptr] "+r"(din_ptr0), - [din1_ptr] "+r"(din_ptr1), - [din2_ptr] "+r"(din_ptr2), - [din3_ptr] "+r"(din_ptr3), - [cnt] "+r"(cnt), - [rmask] "+r"(rmask_ptr), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias_val] "r"(bias_val), - [vzero] "w"(vzero), - [remain] "r"(remain) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } - dout_ptr += 2 * w_out; - } //! end of processing mid rows -#endif - } - } -} -/** - * \brief depthwise convolution, kernel size 3x3, stride 1, pad 1, with bias, - * width <= 4 - */ -void conv_depthwise_3x3s1p0_bias_s(float *dout, - const float *din, - const float *weights, - const float *bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext *ctx) { - //! 3x3s1 convolution, implemented by direct algorithm - //! pad is done implicit - //! 
for 4x6 convolution window - const int right_pad_idx[8] = {5, 4, 3, 2, 1, 0, 0, 0}; - const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; - - float32x4_t vzero = vdupq_n_f32(0.f); - uint32x4_t vmask_rp1 = - vcgeq_s32(vld1q_s32(right_pad_idx), vdupq_n_s32(6 - w_in)); - uint32x4_t vmask_rp2 = - vcgeq_s32(vld1q_s32(right_pad_idx + 4), vdupq_n_s32(6 - w_in)); - - unsigned int vmask[8]; - vst1q_u32(vmask, vmask_rp1); - vst1q_u32(vmask + 4, vmask_rp2); - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - for (int n = 0; n < num; ++n) { - const float *din_batch = din + n * ch_in * size_in_channel; - float *dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - float *dout_channel = dout_batch + i * size_out_channel; - const float *din_channel = din_batch + i * size_in_channel; - const float *weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - -#ifdef __aarch64__ - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } -#endif // __aarch64__ - - float out_buf1[4]; - float out_buf2[4]; - float trash_buf[4]; - - float *doutr0 = dout_channel; - float *doutr1 = dout_channel + w_out; - - for (int j = 0; j < h_out; j += 2) { - const float *dr0 = din_channel + j * w_in; - const float *dr1 = dr0 + w_in; - const float *dr2 = dr1 + w_in; - const float *dr3 = dr2 + w_in; - - doutr0 = dout_channel + j * w_out; - doutr1 = doutr0 + w_out; - - if (j + 3 >= h_in) { - switch (j + 3 - h_in) { - case 3: - dr1 = zero_ptr; - case 2: - dr2 = zero_ptr; - case 1: - dr3 = zero_ptr; - doutr1 = trash_buf; - case 0: - dr3 = zero_ptr; - doutr1 = trash_buf; - default: - break; - } - } -#ifdef __aarch64__ - if (flag_relu) { - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - } else { - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vbias] "w"(wbias), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [zero] "w"(vzero), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - } -#else - unsigned int *vmask_ptr = vmask; - float bias_val = flag_bias ? 
bias[i] : 0.f; - if (flag_relu) { - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1_RELU - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(COMPUTE_S_S1_P0 RESULT_S_S1 - : [din0] "+r"(dr0), - [din1] "+r"(dr1), - [din2] "+r"(dr2), - [din3] "+r"(dr3), - [vmask] "+r"(vmask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [vzero] "w"(vzero), - [bias_val] "r"(bias_val), - [out1] "r"(out_buf1), - [out2] "r"(out_buf2) - : "cc", - "memory", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } -#endif - for (int w = 0; w < w_out; ++w) { - *doutr0++ = out_buf1[w]; - *doutr1++ = out_buf2[w]; - } - } // end of processing heights - } // end of processing channels - } // end of processing batchs -} -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/conv_depthwise_3x3s2.cc b/lite/backends/arm/math/conv_depthwise_3x3s2.cc deleted file mode 100644 index ec039af98c..0000000000 --- a/lite/backends/arm/math/conv_depthwise_3x3s2.cc +++ /dev/null @@ -1,1862 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
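// The two files deleted in this patch implement 3x3 depthwise convolution
// (stride 1 and stride 2, pad 0 or 1, optional bias and fused ReLU) as
// hand-scheduled NEON assembly. As a reading aid, here is a minimal scalar
// sketch of the computation those kernels perform, assuming NCHW layout and
// 9 weights per channel as in the deleted kernels' signatures; the helper
// name depthwise_conv3x3_ref is illustrative and not code from this repo.
#include <algorithm>

static void depthwise_conv3x3_ref(float* dout, const float* din,
                                  const float* weights, const float* bias,
                                  bool flag_bias, bool flag_relu, int num,
                                  int ch, int h_in, int w_in, int h_out,
                                  int w_out, int stride, int pad) {
  for (int n = 0; n < num; ++n) {
    for (int c = 0; c < ch; ++c) {
      const float* src = din + (n * ch + c) * h_in * w_in;  // one channel
      const float* wc = weights + c * 9;                    // its 3x3 filter
      float* dst = dout + (n * ch + c) * h_out * w_out;
      float b = flag_bias ? bias[c] : 0.f;
      for (int oh = 0; oh < h_out; ++oh) {
        for (int ow = 0; ow < w_out; ++ow) {
          float sum = b;
          for (int kh = 0; kh < 3; ++kh) {
            for (int kw = 0; kw < 3; ++kw) {
              int ih = oh * stride - pad + kh;  // taps falling outside the
              int iw = ow * stride - pad + kw;  // input are the implicit pad
              if (ih >= 0 && ih < h_in && iw >= 0 && iw < w_in) {
                sum += src[ih * w_in + iw] * wc[kh * 3 + kw];
              }
            }
          }
          dst[oh * w_out + ow] = flag_relu ? std::max(sum, 0.f) : sum;
        }
      }
    }
  }
}
// The assembly reaches the same sums without per-tap bounds checks: rows are
// loaded whole (ld2/vld2 splits even and odd lanes for stride 2), ext/vext
// builds the shifted 1234/2345 windows, and bif/vbif against the precomputed
// vmask/rmask vectors zeroes the lanes that fall in the right padding.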
- -#include "lite/backends/arm/math/conv_depthwise.h" -#include <arm_neon.h> - -namespace paddle { -namespace lite { -namespace arm { -namespace math { -void conv_depthwise_3x3s2p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx); - -void conv_depthwise_3x3s2_fp32(const float* din, - float* dout, - int num, - int ch_out, - int h_out, - int w_out, - int ch_in, - int h_in, - int w_in, - const float* weights, - const float* bias, - int pad, - bool flag_bias, - bool flag_relu, - ARMContext* ctx) { - if (pad == 0) { - if (w_in > 7) { - conv_depthwise_3x3s2p0_bias(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p0_bias_s(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } - if (pad == 1) { - if (w_in > 7) { - conv_depthwise_3x3s2p1_bias(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } else { - conv_depthwise_3x3s2p1_bias_s(dout, - din, - weights, - bias, - flag_bias, - flag_relu, - num, - ch_in, - h_in, - w_in, - h_out, - w_out, - ctx); - } - } -} -#ifdef __aarch64__ -#define INIT_S2 \ - "prfm pldl1keep, [%[inptr0]] \n" \ - "prfm pldl1keep, [%[inptr1]] \n" \ - "prfm pldl1keep, [%[inptr2]] \n" \ - "prfm pldl1keep, [%[inptr3]] \n" \ - "prfm pldl1keep, [%[inptr4]] \n" \ - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ - \ - "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ - "and v17.16b, %[vbias].16b, %[vbias].16b \n" - -#define LEFT_COMPUTE_S2 \ - "ext v10.16b, %[vzero].16b, v1.16b, #12 \n" /* r0 */ \ - "fmul v11.4s, v0.4s, %[w0].s[1] \n" /* {0,2,4,6} * w01 */ \ - "fmul v12.4s, v1.4s, %[w0].s[2] \n" /* {1,3,5,7} * w02 */ \ - "fmla v16.4s, v10.4s, %[w0].s[0] \n" /* {0,1,3,5} * w00*/ \ - \ - "ext v10.16b, %[vzero].16b, v3.16b, #12 \n" /* v10 = {0,1,3,5} */ \ - \ - "sub %[inptr0], %[inptr0], #4 \n" \ - "sub %[inptr1], %[inptr1], #4 \n" /* r1 */ \ - "fmla v11.4s, v2.4s, %[w1].s[1] \n" \ - "fmla v12.4s, v3.4s, %[w1].s[2] \n" \ - "fmla v16.4s, v10.4s, %[w1].s[0] \n" \ - \ - "ext v10.16b, %[vzero].16b, v5.16b, #12 \n" \ - \ - "sub %[inptr2], %[inptr2], #4 \n" \ - "sub %[inptr3], %[inptr3], #4 \n" /* r2 */ \ - "fmul v13.4s, v4.4s, %[w0].s[1] \n" \ - "fmla v11.4s, v4.4s, %[w2].s[1] 
\n" \ - \ - "fmul v14.4s, v5.4s, %[w0].s[2] \n" \ - "fmla v12.4s, v5.4s, %[w2].s[2] \n" \ - \ - "fmla v17.4s, v10.4s, %[w0].s[0] \n" \ - "fmla v16.4s, v10.4s, %[w2].s[0] \n" \ - \ - "ext v10.16b, %[vzero].16b, v7.16b, #12 \n" \ - \ - "sub %[inptr4], %[inptr4], #4 \n" /* r3 */ \ - "fmla v13.4s, v6.4s, %[w1].s[1] \n" \ - "fmla v14.4s, v7.4s, %[w1].s[2] \n" \ - "fmla v17.4s, v10.4s, %[w1].s[0] \n" \ - \ - "ext v10.16b, %[vzero].16b, v9.16b, #12 \n" \ - "fadd v16.4s, v16.4s, v11.4s \n" \ - "fadd v16.4s, v16.4s, v12.4s \n" - -#define LEFT_RESULT_S2 \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ - \ - "st1 {v16.4s}, [%[outptr0]], #16 \n" \ - \ - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ - \ - "fadd v17.4s, v17.4s, v13.4s \n" \ - \ - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ - "ld1 {v15.4s}, [%[inptr0]] \n" \ - "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ - \ - "fadd v17.4s, v17.4s, v14.4s \n" \ - \ - "ld1 {v18.4s}, [%[inptr1]] \n" \ - "ld1 {v19.4s}, [%[inptr2]] \n" \ - \ - "ext v10.16b, v0.16b, v15.16b, #4 \n" \ - \ - "ld1 {v20.4s}, [%[inptr3]] \n" \ - "ld1 {v21.4s}, [%[inptr4]] \n" \ - \ - "st1 {v17.4s}, [%[outptr1]], #16 \n" \ - \ - "cmp %w[cnt], #1 \n" \ - \ - "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ - \ - "blt 1f \n" - -#define MID_COMPUTE_S2 \ - "2: \n" /* r0 */ \ - "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ - "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ - "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ - \ - "ext v10.16b, v2.16b, v18.16b, #4 \n" \ - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" /* r1 */ \ - "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ - "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ - "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ - \ - "ext v10.16b, v4.16b, v19.16b, #4 \n" \ - \ - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" /* r2 */ \ - "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ - "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ - \ - "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ - "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ - \ - "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ - "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ - \ - "ext v10.16b, v6.16b, v20.16b, #4 \n" \ - \ - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" /* r3 */ \ - "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ - "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ - \ - "ext v10.16b, v8.16b, v21.16b, #4 \n" \ - \ - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ - \ - "fadd v16.4s, v16.4s, v11.4s \n" \ - "fadd v16.4s, v16.4s, v12.4s \n" - -#define MID_RESULT_S2 \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ - \ - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ - "ld1 {v15.4s}, [%[inptr0]] \n" \ - "ld1 {v18.4s}, [%[inptr1]] \n" \ - "st1 {v16.4s}, [%[outptr0]], #16 \n" \ - \ - "fadd v17.4s, v17.4s, v13.4s \n" \ - \ - "ld1 {v19.4s}, [%[inptr2]] \n" \ - "ld1 {v20.4s}, [%[inptr3]] \n" \ - "ld1 {v21.4s}, [%[inptr4]] \n" \ - \ - "fadd v17.4s, v17.4s, v14.4s \n" \ - \ - "ext v10.16b, v0.16b, v15.16b, #4 \n" \ - "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ - "subs %w[cnt], %w[cnt], #1 \n" \ - \ - "st1 {v17.4s}, [%[outptr1]], #16 \n" \ - \ - "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ - \ - "bne 2b \n" - -#define RIGHT_COMPUTE_S2 \ - "1: \n" \ - "cmp %w[remain], #1 \n" \ - "blt 4f \n" \ - "3: \n" \ - "bif v0.16b, %[vzero].16b, %[mask1].16b \n" \ - "bif v1.16b, %[vzero].16b, 
%[mask2].16b \n" \ - \ - "bif v2.16b, %[vzero].16b, %[mask1].16b \n" \ - "bif v3.16b, %[vzero].16b, %[mask2].16b \n" \ - \ - "bif v4.16b, %[vzero].16b, %[mask1].16b \n" \ - "bif v5.16b, %[vzero].16b, %[mask2].16b \n" \ - \ - "ext v10.16b, v0.16b, %[vzero].16b, #4 \n" \ - \ - "bif v6.16b, %[vzero].16b, %[mask1].16b \n" \ - "bif v7.16b, %[vzero].16b, %[mask2].16b \n" /* r0 */ \ - "fmul v11.4s, v0.4s, %[w0].s[0] \n" \ - "fmul v12.4s, v1.4s, %[w0].s[1] \n" \ - "fmla v16.4s, v10.4s, %[w0].s[2] \n" \ - \ - "ext v10.16b, v2.16b, %[vzero].16b, #4 \n" \ - "bif v8.16b, %[vzero].16b, %[mask1].16b \n" \ - "bif v9.16b, %[vzero].16b, %[mask2].16b \n" /* r1 */ \ - "fmla v11.4s, v2.4s, %[w1].s[0] \n" \ - "fmla v12.4s, v3.4s, %[w1].s[1] \n" \ - "fmla v16.4s, v10.4s, %[w1].s[2] \n" \ - \ - "ext v10.16b, v4.16b, %[vzero].16b, #4 \n" /* r2 */ \ - "fmul v13.4s, v4.4s, %[w0].s[0] \n" \ - "fmla v11.4s, v4.4s, %[w2].s[0] \n" \ - \ - "fmul v14.4s, v5.4s, %[w0].s[1] \n" \ - "fmla v12.4s, v5.4s, %[w2].s[1] \n" \ - \ - "fmla v17.4s, v10.4s, %[w0].s[2] \n" \ - "fmla v16.4s, v10.4s, %[w2].s[2] \n" \ - \ - "ext v10.16b, v6.16b, %[vzero].16b, #4 \n" /* r3 */ \ - "fmla v13.4s, v6.4s, %[w1].s[0] \n" \ - "fmla v14.4s, v7.4s, %[w1].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w1].s[2] \n" \ - \ - "ext v10.16b, v8.16b, %[vzero].16b, #4 \n" \ - "ld1 {v0.4s}, [%[outptr0]] \n" \ - \ - "fadd v16.4s, v16.4s, v11.4s \n" \ - "fadd v16.4s, v16.4s, v12.4s \n" \ - "ld1 {v1.4s}, [%[outptr1]] \n" - -#define RIGHT_RESULT_S2 \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ - \ - "bif v16.16b, v0.16b, %[wmask].16b \n" \ - \ - "fadd v17.4s, v17.4s, v13.4s \n" \ - \ - "st1 {v16.4s}, [%[outptr0]], #16 \n" \ - \ - "fadd v17.4s, v17.4s, v14.4s \n" \ - \ - "bif v17.16b, v1.16b, %[wmask].16b \n" \ - \ - "st1 {v17.4s}, [%[outptr1]], #16 \n" \ - "4: \n" - -#define LEFT_RESULT_S2_RELU \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[1] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[2] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[0] \n" \ - \ - "fmax v16.4s, v16.4s, %[vzero].4s \n" \ - \ - "ld2 {v0.4s, v1.4s}, [%[inptr0]], #32 \n" \ - "ld2 {v2.4s, v3.4s}, [%[inptr1]], #32 \n" \ - "ld2 {v4.4s, v5.4s}, [%[inptr2]], #32 \n" \ - \ - "fadd v17.4s, v17.4s, v13.4s \n" \ - \ - "st1 {v16.4s}, [%[outptr0]], #16 \n" \ - \ - "ld2 {v6.4s, v7.4s}, [%[inptr3]], #32 \n" \ - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ - "ld1 {v15.4s}, [%[inptr0]] \n" \ - \ - "fadd v17.4s, v17.4s, v14.4s \n" \ - \ - "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ - \ - "ld1 {v18.4s}, [%[inptr1]] \n" \ - "ld1 {v19.4s}, [%[inptr2]] \n" \ - \ - "ext v10.16b, v0.16b, v15.16b, #4 \n" \ - \ - "fmax v17.4s, v17.4s, %[vzero].4s \n" \ - \ - "ld1 {v20.4s}, [%[inptr3]] \n" \ - "ld1 {v21.4s}, [%[inptr4]] \n" \ - \ - "st1 {v17.4s}, [%[outptr1]], #16 \n" \ - \ - "cmp %w[cnt], #1 \n" \ - \ - "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ - \ - "blt 1f \n" - -#define MID_RESULT_S2_RELU \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ - \ - "ld2 {v8.4s, v9.4s}, [%[inptr4]], #32 \n" \ - "ld1 {v15.4s}, [%[inptr0]] \n" \ - "ld1 {v18.4s}, [%[inptr1]] \n" \ - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ - \ - "fadd v17.4s, v17.4s, v13.4s \n" \ - \ - "ld1 {v19.4s}, [%[inptr2]] \n" \ - "ld1 {v20.4s}, [%[inptr3]] \n" \ - "ld1 {v21.4s}, [%[inptr4]] \n" \ - \ - "st1 {v16.4s}, [%[outptr0]], #16 \n" \ - \ - "fadd v17.4s, v17.4s, v14.4s \n" \ - \ - "ext v10.16b, 
v0.16b, v15.16b, #4 \n" \ - "and v16.16b, %[vbias].16b, %[vbias].16b \n" \ - "subs %w[cnt], %w[cnt], #1 \n" \ - \ - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ - \ - "st1 {v17.4s}, [%[outptr1]], #16 \n" \ - \ - "and v17.16b, %[vbias].16b, %[vbias].16b \n" \ - \ - "bne 2b \n" - -#define RIGHT_RESULT_S2_RELU \ - /* r4 */ \ - "fmla v13.4s, v8.4s, %[w2].s[0] \n" \ - "fmla v14.4s, v9.4s, %[w2].s[1] \n" \ - "fmla v17.4s, v10.4s, %[w2].s[2] \n" \ - \ - "fmax v16.4s, v16.4s, %[vzero].4s \n" /* relu */ \ - \ - "fadd v17.4s, v17.4s, v13.4s \n" \ - \ - "bif v16.16b, v0.16b, %[wmask].16b \n" \ - \ - "fadd v17.4s, v17.4s, v14.4s \n" \ - \ - "st1 {v16.4s}, [%[outptr0]], #16 \n" \ - \ - "fmax v17.4s, v17.4s, %[vzero].4s \n" /* relu */ \ - \ - "bif v17.16b, v1.16b, %[wmask].16b \n" \ - \ - "st1 {v17.4s}, [%[outptr1]], #16 \n" \ - "4: \n" - -#define COMPUTE_S_S2 \ - "movi v9.4s, #0 \n" \ - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ - \ - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" \ - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" \ - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" \ - \ - "bif v10.16b, v9.16b, v6.16b \n" \ - "bif v11.16b, v9.16b, v7.16b \n" \ - "bif v12.16b, v9.16b, v6.16b \n" \ - "bif v13.16b, v9.16b, v7.16b \n" \ - "bif v14.16b, v9.16b, v6.16b \n" \ - "bif v15.16b, v9.16b, v7.16b \n" \ - \ - "ext v6.16b, v9.16b, v11.16b, #12 \n" \ - "ext v7.16b, v9.16b, v13.16b, #12 \n" \ - "ext v8.16b, v9.16b, v15.16b, #12 \n" \ - \ - "fmul v4.4s, v10.4s, %[wr0].s[1] \n" \ - "fmul v5.4s, v11.4s, %[wr0].s[2] \n" \ - "fmul v6.4s, v6.4s, %[wr0].s[0] \n" \ - \ - "fmla v4.4s, v12.4s, %[wr1].s[1] \n" \ - "fmla v5.4s, v13.4s, %[wr1].s[2] \n" \ - "fmla v6.4s, v7.4s, %[wr1].s[0] \n" \ - \ - "fmla v4.4s, v14.4s, %[wr2].s[1] \n" \ - "fmla v5.4s, v15.4s, %[wr2].s[2] \n" \ - "fmla v6.4s, v8.4s, %[wr2].s[0] \n" \ - \ - "fadd v4.4s, v4.4s, v5.4s \n" \ - "fadd v4.4s, v4.4s, v6.4s \n" - -#define RESULT_S_S2 \ - "fadd v4.4s, v4.4s, %[bias].4s \n" \ - \ - "st1 {v4.4s}, [%[out]] \n" - -#define RESULT_S_S2_RELU \ - "fadd v4.4s, v4.4s, %[bias].4s \n" \ - "fmax v4.4s, v4.4s, v9.4s \n" \ - \ - "st1 {v4.4s}, [%[out]] \n" - -#define COMPUTE_S_S2_P0 \ - "movi v9.4s, #0 \n" \ - "ld1 {v6.4s, v7.4s}, [%[mask_ptr]], #32 \n" \ - \ - "ld2 {v10.4s, v11.4s}, [%[din0_ptr]], #32 \n" \ - "ld2 {v12.4s, v13.4s}, [%[din1_ptr]], #32 \n" \ - "ld2 {v14.4s, v15.4s}, [%[din2_ptr]], #32 \n" \ - "and v4.16b, %[bias].16b, %[bias].16b \n" \ - \ - "bif v10.16b, v9.16b, v6.16b \n" \ - "bif v11.16b, v9.16b, v7.16b \n" \ - "bif v12.16b, v9.16b, v6.16b \n" \ - "bif v13.16b, v9.16b, v7.16b \n" \ - "bif v14.16b, v9.16b, v6.16b \n" \ - "bif v15.16b, v9.16b, v7.16b \n" \ - \ - "ext v6.16b, v10.16b, v9.16b, #4 \n" \ - "ext v7.16b, v12.16b, v9.16b, #4 \n" \ - "ext v8.16b, v14.16b, v9.16b, #4 \n" \ - \ - "fmla v4.4s, v10.4s, %[wr0].s[0] \n" \ - "fmul v5.4s, v11.4s, %[wr0].s[1] \n" \ - "fmul v16.4s, v6.4s, %[wr0].s[2] \n" \ - \ - "fmla v4.4s, v12.4s, %[wr1].s[0] \n" \ - "fmla v5.4s, v13.4s, %[wr1].s[1] \n" \ - "fmla v16.4s, v7.4s, %[wr1].s[2] \n" \ - \ - "fmla v4.4s, v14.4s, %[wr2].s[0] \n" \ - "fmla v5.4s, v15.4s, %[wr2].s[1] \n" \ - "fmla v16.4s, v8.4s, %[wr2].s[2] \n" \ - \ - "fadd v4.4s, v4.4s, v5.4s \n" \ - "fadd v4.4s, v4.4s, v16.4s \n" - -#define RESULT_S_S2_P0 "st1 {v4.4s}, [%[out]] \n" - -#define RESULT_S_S2_P0_RELU \ - "fmax v4.4s, v4.4s, v9.4s \n" \ - "st1 {v4.4s}, [%[out]] \n" - -#else -#define INIT_S2 \ - "vmov.u32 q9, #0 \n" \ - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r1\n" \ - "vld2.32 {d24-d27}, [%[din1_ptr]]! 
@ load din r1\n" \ - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" \ - "pld [%[din0_ptr]] @ preload data\n" \ - "pld [%[din1_ptr]] @ preload data\n" \ - "pld [%[din2_ptr]] @ preload data\n" \ - \ - "vdup.32 q3, %[bias] @ and \n" - -#define LEFT_COMPUTE_S2 \ - "vext.32 q6, q9, q11, #3 @ shift right 1 data\n" \ - "vext.32 q7, q9, q13, #3 @ shift right 1 data\n" \ - "vext.32 q8, q9, q15, #3 @ shift right 1 data\n" \ - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 1, out0\n" \ - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 1, out0\n" \ - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 1, out0\n" \ - \ - "sub %[din0_ptr], #4 @ inpitr0 - 1\n" \ - "sub %[din1_ptr], #4 @ inpitr1 - 1\n" \ - "sub %[din2_ptr], #4 @ inpitr2 - 1\n" \ - \ - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ - \ - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, out0\n" \ - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, out0\n" \ - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, out0\n" \ - \ - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ - \ - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 1, out1\n" \ - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 1, out1\n" \ - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 1, out1\n" \ - \ - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r1\n" \ - \ - "vadd.f32 q3, q3, q4 @ add \n" \ - "vadd.f32 q3, q3, q5 @ add \n" - -#define LEFT_RESULT_S2 \ - "vst1.32 {d6-d7}, [%[outptr]]! \n" \ - "cmp %[cnt], #1 \n" \ - "blt 1f \n" - -#define MID_COMPUTE_S2 \ - "2: \n" \ - "vld1.32 {d16}, [%[din0_ptr]] @ load din r0\n" \ - "vdup.32 q3, %[bias] @ and \n" \ - "vext.32 q6, q10, q8, #1 @ shift left 1 \n" \ - "vld1.32 {d16}, [%[din1_ptr]] @ load din r1\n" \ - \ - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ - \ - "vext.32 q7, q12, q8, #1 @ shift left 1 \n" \ - "vld1.32 {d16}, [%[din2_ptr]] @ load din r1\n" \ - \ - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ - \ - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ - \ - "vext.32 q6, q14, q8, #1 @ shift left 1 \n" \ - \ - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ - \ - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, out0\n" \ - \ - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ - \ - "vadd.f32 q3, q3, q4 @ add \n" \ - "vadd.f32 q3, q3, q5 @ add \n" - -#define MID_RESULT_S2 \ - "subs %[cnt], #1 \n" \ - \ - "vst1.32 {d6-d7}, [%[outptr]]! \n" \ - "bne 2b \n" - -#define RIGHT_COMPUTE_S2 \ - "1: \n" \ - "cmp %[remain], #1 \n" \ - "blt 3f \n" \ - \ - "vld1.f32 {d12-d15}, [%[mask_ptr]]! 
@ load mask\n" \ - "vdup.32 q3, %[bias] @ and \n" \ - \ - "vbif q10, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q11, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - "vbif q12, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q13, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - "vbif q14, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q15, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - \ - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" \ - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" \ - \ - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ - \ - "vext.32 q6, q14, q9, #1 @ shift left 1 \n" \ - "vld1.f32 {d20-d21}, [%[outptr]] @ load output\n" \ - \ - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ - \ - "vld1.f32 {d22-d23}, [%[mask_ptr]] @ load mask\n" \ - \ - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ - "vmla.f32 q3, q6, %f[wr2][0] @ mul weight 2, out0\n" \ - \ - "vadd.f32 q3, q3, q4 @ add \n" \ - "vadd.f32 q3, q3, q5 @ add \n" - -#define RIGHT_RESULT_S2 \ - "vbif.f32 q3, q10, q11 @ write mask\n" \ - \ - "vst1.32 {d6-d7}, [%[outptr]]! \n" \ - "3: \n" - -#define LEFT_RESULT_S2_RELU \ - "vmax.f32 q3, q3, q9 @ relu \n" \ - "vst1.32 {d6-d7}, [%[outptr]]! \n" \ - "cmp %[cnt], #1 \n" \ - "blt 1f \n" - -#define MID_RESULT_S2_RELU \ - "vmax.f32 q3, q3, q9 @ relu \n" \ - "subs %[cnt], #1 \n" \ - \ - "vst1.32 {d6-d7}, [%[outptr]]! \n" \ - "bne 2b \n" - -#define RIGHT_RESULT_S2_RELU \ - "vmax.f32 q3, q3, q9 @ relu \n" \ - "vbif.f32 q3, q10, q11 @ write mask\n" \ - \ - "vst1.32 {d6-d7}, [%[outptr]]! \n" \ - "3: \n" - -#define COMPUTE_S_S2 \ - "vmov.u32 q9, #0 \n" \ - "vld1.f32 {d12-d15}, [%[mask_ptr]]! @ load mask\n" \ - "vdup.32 q3, %[bias] @ and \n" \ - \ - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ - "vld2.32 {d28-d31}, [%[din2_ptr]]! 
@ load din r2\n" \ - \ - "vbif q10, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q11, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - "vbif q12, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q13, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - "vbif q14, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q15, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - \ - "vext.32 q6, q9, q11, #3 @ shift left 1 \n" \ - "vext.32 q7, q9, q13, #3 @ shift left 1 \n" \ - "vext.32 q8, q9, q15, #3 @ shift left 1 \n" \ - \ - "vmul.f32 q4, q10, %e[wr0][1] @ mul weight 0, out0\n" \ - "vmul.f32 q5, q11, %f[wr0][0] @ mul weight 0, out0\n" \ - "vmla.f32 q3, q6, %e[wr0][0] @ mul weight 0, out0\n" \ - \ - "vmla.f32 q4, q12, %e[wr1][1] @ mul weight 1, out0\n" \ - "vmla.f32 q5, q13, %f[wr1][0] @ mul weight 1, out0\n" \ - "vmla.f32 q3, q7, %e[wr1][0] @ mul weight 1, out0\n" \ - \ - "vmla.f32 q4, q14, %e[wr2][1] @ mul weight 2, out0\n" \ - "vmla.f32 q5, q15, %f[wr2][0] @ mul weight 2, out0\n" \ - "vmla.f32 q3, q8, %e[wr2][0] @ mul weight 2, out0\n" \ - \ - "vadd.f32 q3, q3, q4 @ add \n" \ - "vadd.f32 q3, q3, q5 @ add \n" - -#define RESULT_S_S2 "vst1.32 {d6-d7}, [%[out]] \n" - -#define RESULT_S_S2_RELU \ - "vmax.f32 q3, q3, q9 @ relu\n" \ - \ - "vst1.32 {d6-d7}, [%[out]] \n" - -#define COMPUTE_S_S2_P0 \ - "vmov.u32 q9, #0 \n" \ - "vld1.f32 {d12-d15}, [%[mask_ptr]] @ load mask\n" \ - "vdup.32 q3, %[bias] @ and \n" \ - \ - "vld2.32 {d20-d23}, [%[din0_ptr]]! @ load din r0\n" \ - "vld2.32 {d24-d27}, [%[din1_ptr]]! @ load din r1\n" \ - "vld2.32 {d28-d31}, [%[din2_ptr]]! @ load din r2\n" \ - \ - "vbif q10, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q11, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - "vbif q12, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q13, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - "vbif q14, q9, q6 @ bit select, deal with " \ - "right pad\n" \ - "vbif q15, q9, q7 @ bit select, deal with " \ - "right pad\n" \ - \ - "vext.32 q6, q10, q9, #1 @ shift left 1 \n" \ - "vext.32 q7, q12, q9, #1 @ shift left 1 \n" \ - "vext.32 q8, q14, q9, #1 @ shift left 1 \n" \ - \ - "vmul.f32 q4, q10, %e[wr0][0] @ mul weight 0, out0\n" \ - "vmul.f32 q5, q11, %e[wr0][1] @ mul weight 0, out0\n" \ - "vmla.f32 q3, q6, %f[wr0][0] @ mul weight 0, out0\n" \ - \ - "vmla.f32 q4, q12, %e[wr1][0] @ mul weight 1, out0\n" \ - "vmla.f32 q5, q13, %e[wr1][1] @ mul weight 1, out0\n" \ - "vmla.f32 q3, q7, %f[wr1][0] @ mul weight 1, out0\n" \ - \ - "vmla.f32 q4, q14, %e[wr2][0] @ mul weight 2, out0\n" \ - "vmla.f32 q5, q15, %e[wr2][1] @ mul weight 2, out0\n" \ - "vmla.f32 q3, q8, %f[wr2][0] @ mul weight 2, out0\n" \ - \ - "vadd.f32 q3, q3, q4 @ add \n" \ - "vadd.f32 q3, q3, q5 @ add \n" - -#define RESULT_S_S2_P0 "vst1.32 {d6-d7}, [%[out]] \n" - -#define RESULT_S_S2_P0_RELU \ - "vmax.f32 q3, q3, q9 @ relu \n" \ - "vst1.32 {d6-d7}, [%[out]] \n" - -#endif - -/** - * \brief depthwise convolution kernel 3x3, stride 2 - * w_in > 7 - */ -void conv_depthwise_3x3s2p1_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - int size_pad_bottom = h_out * 2 - h_in; - - int cnt_col = (w_out >> 2) - 2; - int size_right_remain = w_in - (7 + cnt_col * 8); - if 
(size_right_remain >= 9) { - cnt_col++; - size_right_remain -= 8; - } - int cnt_remain = (size_right_remain == 8) ? 4 : (w_out % 4); // - - int size_right_pad = w_out * 2 - w_in; - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); -#ifdef __aarch64__ - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } -#else - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[i]; - } -#endif // __aarch64__ - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 + w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_in; i += 4) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - din3_ptr = dr2; - din4_ptr = dr3; - dr0 = dr3; - dr1 = dr4; - } else { - dr0 = dr4; - dr1 = dr0 + w_in; - } - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i + 4 > h_in) { - switch (i + 4 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - default: - break; - } - } - //! 
process output pad - if (i / 2 + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = cnt_col; - if (flag_relu) { - asm volatile( - INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 - MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - } else { - asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 - MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - } - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_in; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - if (i == 0) { - din0_ptr = zero_ptr; - din1_ptr = dr0; - din2_ptr = dr1; - dr0 = dr1; - dr1 = dr2; - dr2 = dr1 + w_in; - } else { - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - } - - //! 
process bottom pad - if (i + 2 > h_in) { - switch (i + 2 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = cnt_col; - unsigned int* mask_ptr = dmask; - if (flag_relu) { - asm volatile( - INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2_RELU MID_COMPUTE_S2 - MID_RESULT_S2_RELU RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(INIT_S2 LEFT_COMPUTE_S2 LEFT_RESULT_S2 MID_COMPUTE_S2 - MID_RESULT_S2 RIGHT_COMPUTE_S2 RIGHT_RESULT_S2 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } - doutr0 = doutr0 + w_out; - } -#endif - } - } -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 - */ -void conv_depthwise_3x3s2p1_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - int hs = -1; - int he = 2; - float out_buf[4]; - for (int j = 0; j < h_out; ++j) { - const float* dr0 = din_channel + hs * w_in; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - if (hs == -1) { - dr0 = zeros; - } - if (he > h_in) { - dr2 = zeros; - } - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - if (flag_relu) { - asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "v4", - "v5", 
- "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - } else { - asm volatile(COMPUTE_S_S2 RESULT_S_S2 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15"); - } -#else - if (flag_relu) { - asm volatile(COMPUTE_S_S2 RESULT_S_S2_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(COMPUTE_S_S2 RESULT_S_S2 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } -#endif - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - hs += 2; - he += 2; - } - } - } -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2 - */ -// w_in > 7 -void conv_depthwise_3x3s2p0_bias(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - - int tile_w = w_out >> 2; - int cnt_remain = w_out % 4; - - unsigned int size_right_remain = (unsigned int)(w_in - (tile_w << 3)); - - uint32x4_t vmask_rp1 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = vcgtq_s32(vdupq_n_s32(size_right_remain), - vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - uint32x4_t wmask = - vcgtq_s32(vdupq_n_s32(cnt_remain), vld1q_s32(out_pad_idx)); // 0 1 2 3 - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - float* zero_ptr = ctx->workspace_data(); - memset(zero_ptr, 0, w_in * sizeof(float)); - float* write_ptr = zero_ptr + w_in; - - unsigned int dmask[12]; - - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - vst1q_u32(dmask + 8, wmask); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float32x4_t vzero = vdupq_n_f32(0.f); - -#ifdef __aarch64__ - float32x4_t wbias; - if (flag_bias) { - wbias = vdupq_n_f32(bias[i]); - } else { - wbias = vdupq_n_f32(0.f); - } -#else - float bias_c = 0.f; - if (flag_bias) { - bias_c = bias[i]; - } -#endif // __aarch64__ - - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - const float* dr3 = dr2 
+ w_in; - const float* dr4 = dr3 + w_in; - - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - const float* din3_ptr = dr3; - const float* din4_ptr = dr4; - - float* doutr0 = dout_channel; - float* doutr0_ptr = nullptr; - float* doutr1_ptr = nullptr; - -#ifdef __aarch64__ - for (int i = 0; i < h_out; i += 2) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - din3_ptr = dr3; - din4_ptr = dr4; - - doutr0_ptr = doutr0; - doutr1_ptr = doutr0 + w_out; - - dr0 = dr4; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - dr3 = dr2 + w_in; - dr4 = dr3 + w_in; - - //! process bottom pad - if (i * 2 + 5 > h_in) { - switch (i * 2 + 5 - h_in) { - case 4: - din1_ptr = zero_ptr; - case 3: - din2_ptr = zero_ptr; - case 2: - din3_ptr = zero_ptr; - case 1: - din4_ptr = zero_ptr; - case 0: - din4_ptr = zero_ptr; - default: - break; - } - } - //! process output pad - if (i + 2 > h_out) { - doutr1_ptr = write_ptr; - } - int cnt = tile_w; - if (flag_relu) { - asm volatile( - INIT_S2 - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - MID_COMPUTE_S2 MID_RESULT_S2_RELU - "cmp %w[remain], #1 \n" - "blt 4f \n" RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2_RELU - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - } else { - asm volatile( - INIT_S2 - "ld1 {v15.4s}, [%[inptr0]] \n" - "ld1 {v18.4s}, [%[inptr1]] \n" - "ld1 {v19.4s}, [%[inptr2]] \n" - "ld1 {v20.4s}, [%[inptr3]] \n" - "ld1 {v21.4s}, [%[inptr4]] \n" - "ext v10.16b, v0.16b, v15.16b, #4 \n" // v10 = {2,4,6,8} - MID_COMPUTE_S2 MID_RESULT_S2 - "cmp %w[remain], #1 \n" - "blt 4f \n" RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2 - "4: \n" - : [inptr0] "+r"(din0_ptr), - [inptr1] "+r"(din1_ptr), - [inptr2] "+r"(din2_ptr), - [inptr3] "+r"(din3_ptr), - [inptr4] "+r"(din4_ptr), - [outptr0] "+r"(doutr0_ptr), - [outptr1] "+r"(doutr1_ptr), - [cnt] "+r"(cnt) - : [vzero] "w"(vzero), - [w0] "w"(wr0), - [w1] "w"(wr1), - [w2] "w"(wr2), - [remain] "r"(cnt_remain), - [mask1] "w"(vmask_rp1), - [mask2] "w"(vmask_rp2), - [wmask] "w"(wmask), - [vbias] "w"(wbias) - : "cc", - "memory", - "v0", - "v1", - "v2", - "v3", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16", - "v17", - "v18", - "v19", - "v20", - "v21"); - } - doutr0 = doutr0 + 2 * w_out; - } -#else - for (int i = 0; i < h_out; i++) { - din0_ptr = dr0; - din1_ptr = dr1; - din2_ptr = dr2; - - doutr0_ptr = doutr0; - - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - //! 
process bottom pad - if (i * 2 + 3 > h_in) { - switch (i * 2 + 3 - h_in) { - case 2: - din1_ptr = zero_ptr; - case 1: - din2_ptr = zero_ptr; - default: - break; - } - } - int cnt = tile_w; - unsigned int* mask_ptr = dmask; - if (flag_relu) { - asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2_RELU - RIGHT_COMPUTE_S2 RIGHT_RESULT_S2_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(INIT_S2 MID_COMPUTE_S2 MID_RESULT_S2 RIGHT_COMPUTE_S2 - RIGHT_RESULT_S2 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [outptr] "+r"(doutr0_ptr), - [cnt] "+r"(cnt), - [mask_ptr] "+r"(mask_ptr) - : [remain] "r"(cnt_remain), - [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } - doutr0 = doutr0 + w_out; - } -#endif - } - } -} - -/** - * \brief depthwise convolution kernel 3x3, stride 2, width <= 4 - */ -void conv_depthwise_3x3s2p0_bias_s(float* dout, - const float* din, - const float* weights, - const float* bias, - bool flag_bias, - bool flag_relu, - const int num, - const int ch_in, - const int h_in, - const int w_in, - const int h_out, - const int w_out, - ARMContext* ctx) { - int right_pad_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7}; - int out_pad_idx[4] = {0, 1, 2, 3}; - float zeros[8] = {0.0f}; - const float zero_ptr[4] = {0.f, 0.f, 0.f, 0.f}; - - uint32x4_t vmask_rp1 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx)); // 0 2 4 6 - uint32x4_t vmask_rp2 = - vcgtq_s32(vdupq_n_s32(w_in), vld1q_s32(right_pad_idx + 4)); // 1 3 5 7 - - int size_in_channel = w_in * h_in; - int size_out_channel = w_out * h_out; - - unsigned int dmask[8]; - vst1q_u32(dmask, vmask_rp1); - vst1q_u32(dmask + 4, vmask_rp2); - - for (int n = 0; n < num; ++n) { - const float* din_batch = din + n * ch_in * size_in_channel; - float* dout_batch = dout + n * ch_in * size_out_channel; -#pragma omp parallel for - for (int i = 0; i < ch_in; ++i) { - const float* din_channel = din_batch + i * size_in_channel; - float* dout_channel = dout_batch + i * size_out_channel; - - const float* weight_ptr = weights + i * 9; - float32x4_t wr0 = vld1q_f32(weight_ptr); - float32x4_t wr1 = vld1q_f32(weight_ptr + 3); - float32x4_t wr2 = vld1q_f32(weight_ptr + 6); - - float bias_c = 0.f; - - if (flag_bias) { - bias_c = bias[i]; - } - float32x4_t vbias = vdupq_n_f32(bias_c); - float out_buf[4]; - const float* dr0 = din_channel; - const float* dr1 = dr0 + w_in; - const float* dr2 = dr1 + w_in; - for (int j = 0; j < h_out; j++) { - const float* din0_ptr = dr0; - const float* din1_ptr = dr1; - const float* din2_ptr = dr2; - if (j * 2 + 2 >= h_in) { - switch (j + 2 - h_in) { - case 1: - din1_ptr = zero_ptr; - case 0: - din2_ptr = zero_ptr; - default: - break; - } - } - dr0 = dr2; - dr1 = dr0 + w_in; - dr2 = dr1 + w_in; - - unsigned int* mask_ptr = dmask; -#ifdef __aarch64__ - if (flag_relu) { - asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - 
[wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); - } else { - asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr), - [mask_ptr] "+r"(mask_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "w"(vbias), - [out] "r"(out_buf) - : "cc", - "memory", - "v4", - "v5", - "v6", - "v7", - "v8", - "v9", - "v10", - "v11", - "v12", - "v13", - "v14", - "v15", - "v16"); - } -#else - if (flag_relu) { - asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0_RELU - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf), - [mask_ptr] "r"(dmask) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } else { - asm volatile(COMPUTE_S_S2_P0 RESULT_S_S2_P0 - : [din0_ptr] "+r"(din0_ptr), - [din1_ptr] "+r"(din1_ptr), - [din2_ptr] "+r"(din2_ptr) - : [wr0] "w"(wr0), - [wr1] "w"(wr1), - [wr2] "w"(wr2), - [bias] "r"(bias_c), - [out] "r"(out_buf), - [mask_ptr] "r"(dmask) - : "cc", - "memory", - "q3", - "q4", - "q5", - "q6", - "q7", - "q8", - "q9", - "q10", - "q11", - "q12", - "q13", - "q14", - "q15"); - } -#endif - for (int w = 0; w < w_out; ++w) { - *dout_channel++ = out_buf[w]; - } - } - } - } -} -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/reduce_prod.cc b/lite/backends/arm/math/reduce_prod.cc old mode 100755 new mode 100644 diff --git a/lite/backends/arm/math/reduce_prod.h b/lite/backends/arm/math/reduce_prod.h old mode 100755 new mode 100644 diff --git a/lite/backends/arm/math/split_merge_lod_tenosr.cc b/lite/backends/arm/math/split_merge_lod_tenosr.cc old mode 100755 new mode 100644 diff --git a/lite/backends/arm/math/split_merge_lod_tenosr.h b/lite/backends/arm/math/split_merge_lod_tenosr.h old mode 100755 new mode 100644 diff --git a/lite/backends/fpga/KD/debugger.hpp b/lite/backends/fpga/KD/debugger.hpp old mode 100755 new mode 100644 diff --git a/lite/backends/fpga/KD/dl_engine.cpp b/lite/backends/fpga/KD/dl_engine.cpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/dl_engine.hpp b/lite/backends/fpga/KD/dl_engine.hpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.cpp b/lite/backends/fpga/KD/llapi/zynqmp_api.cpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/llapi/zynqmp_api.h b/lite/backends/fpga/KD/llapi/zynqmp_api.h old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/conv_process.hpp b/lite/backends/fpga/KD/pes/conv_process.hpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/crop_pe.cpp b/lite/backends/fpga/KD/pes/crop_pe.cpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp b/lite/backends/fpga/KD/pes/depthwise_conv_pe.hpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp b/lite/backends/fpga/KD/pes/elementwise_mul_pe.hpp old mode 100755 new mode 100644 diff --git a/lite/backends/fpga/KD/pes/fully_connected_pe.hpp b/lite/backends/fpga/KD/pes/fully_connected_pe.hpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/gru_pe.hpp b/lite/backends/fpga/KD/pes/gru_pe.hpp old mode 100755 
new mode 100644 diff --git a/lite/backends/fpga/KD/pes/gru_util.hpp b/lite/backends/fpga/KD/pes/gru_util.hpp old mode 100755 new mode 100644 diff --git a/lite/backends/fpga/KD/pes/output_pe.hpp b/lite/backends/fpga/KD/pes/output_pe.hpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/pooling_pe.hpp b/lite/backends/fpga/KD/pes/pooling_pe.hpp old mode 100644 new mode 100755 diff --git a/lite/backends/fpga/KD/pes/scale_pe.hpp b/lite/backends/fpga/KD/pes/scale_pe.hpp old mode 100755 new mode 100644 diff --git a/lite/backends/fpga/lite_tensor.cc b/lite/backends/fpga/lite_tensor.cc old mode 100644 new mode 100755 diff --git a/lite/backends/npu/builder.cc b/lite/backends/npu/builder.cc deleted file mode 100644 index 954fad8c91..0000000000 --- a/lite/backends/npu/builder.cc +++ /dev/null @@ -1,192 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/npu/builder.h" -#include // NOLINT -#include -#include "lite/backends/npu/runtime.h" - -namespace paddle { -namespace lite { -namespace npu { - -// Build HIAI IR graph to om model, and store om model data into lite tensor -bool BuildModel(std::vector& inputs, // NOLINT - std::vector& outputs, // NOLINT - lite::Tensor* model_data) { - LOG(INFO) << "[NPU] Build model."; - CHECK_GT(inputs.size(), 0); - CHECK_GT(outputs.size(), 0); - CHECK_NE(model_data, 0); - // build IR graph to om model - ge::Graph ir_graph("graph"); - ir_graph.SetInputs(inputs).SetOutputs(outputs); - ge::Model om_model("model", "model"); - om_model.SetGraph(ir_graph); - domi::HiaiIrBuild ir_build; - domi::ModelBufferData om_model_buf; - if (!ir_build.CreateModelBuff(om_model, om_model_buf)) { - LOG(WARNING) << "[NPU] CreateModelBuff failed!"; - return false; - } - if (!ir_build.BuildIRModel(om_model, om_model_buf)) { - LOG(WARNING) << "[NPU] BuildIRModel failed!"; - return false; - } - // store om model into tensor - model_data->Resize({om_model_buf.length}); - memcpy(model_data->mutable_data(), - om_model_buf.data, - om_model_buf.length); - ir_build.ReleaseModelBuff(om_model_buf); - return true; -} - -std::string UniqueName(const std::string& prefix) { - static std::mutex counter_mtx; - static std::unordered_map counter_map; - std::unique_lock counter_lck(counter_mtx); - int counter = 1; - auto it = counter_map.find(prefix); - if (it == counter_map.end()) { - counter_map[prefix] = counter; - } else { - counter = ++(it->second); - } - return prefix + "_" + std::to_string(counter); -} - -ge::DataType CvtPrecisionType(PrecisionType itype) { - ge::DataType otype = ge::DT_FLOAT; - switch (itype) { - case PRECISION(kFloat): - otype = ge::DT_FLOAT; - break; - case PRECISION(kInt8): - otype = ge::DT_INT8; - break; - case PRECISION(kInt32): - otype = ge::DT_INT32; - break; - default: - LOG(FATAL) << "[NPU] Can not convert precision type(" - << PrecisionToStr(itype) << ") from Lite to NPU"; - break; - } - return otype; -} - -ge::Format CvtDataLayoutType(DataLayoutType 
itype) { - ge::Format otype = ge::FORMAT_NCHW; - switch (itype) { - case DATALAYOUT(kNCHW): - otype = ge::FORMAT_NCHW; - break; - // TODO(hong19860320) support more data layout type - default: - LOG(FATAL) << "[NPU] Can not convert data layout type(" - << DataLayoutToStr(itype) << ") from Lite to NPU"; - break; - } - return otype; -} - -ge::TensorPtr CvtTensor(lite::Tensor* in_tensor, - std::vector out_shape, - PrecisionType in_ptype, - DataLayoutType in_ltype) { - uint8_t* in_data = nullptr; - auto in_size = in_tensor->dims().production(); - auto in_shape = in_tensor->dims().Vectorize(); - if (out_shape.empty()) { - out_shape = in_shape; - } - int in_bytes; - if (in_ptype == PRECISION(kFloat)) { - in_data = reinterpret_cast(in_tensor->mutable_data()); - in_bytes = in_size * sizeof(float); - } else if (in_ptype == PRECISION(kInt32)) { - in_data = reinterpret_cast(in_tensor->mutable_data()); - in_bytes = in_size * sizeof(int32_t); - } else if (in_ptype == PRECISION(kInt8)) { - in_data = reinterpret_cast(in_tensor->mutable_data()); - in_bytes = in_size * sizeof(int8_t); - } else { - LOG(FATAL) << "[NPU] Unknow precision type " << PrecisionToStr(in_ptype); - } - ge::DataType out_ptype = CvtPrecisionType(in_ptype); - ge::Format out_ltype = CvtDataLayoutType(in_ltype); - - ge::TensorDesc out_desc(ge::Shape(out_shape), out_ltype, out_ptype); - CHECK_EQ(out_ltype, ge::FORMAT_NCHW); - - auto out_size = out_desc.GetShape().GetShapeSize(); - CHECK_EQ(out_size, in_size); - - ge::TensorPtr out_tensor = std::make_shared(); - out_tensor->SetTensorDesc(out_desc); - out_tensor->SetData(in_data, in_bytes); - return out_tensor; -} - -int CvtActMode(std::string act_type) { - int act_mode = 1; - if (act_type == "sigmoid") { - act_mode = 0; - } else if (act_type == "relu") { - act_mode = 1; - } else if (act_type == "tanh") { - act_mode = 2; - } else if (act_type == "relu_clipped") { - act_mode = 3; - } else if (act_type == "elu") { - act_mode = 4; - } else if (act_type == "leaky_relu") { - act_mode = 5; - } else if (act_type == "abs") { - act_mode = 6; - } else if (act_type == "softsign") { - act_mode = 8; - } else if (act_type == "softplus") { - act_mode = 9; - } else if (act_type == "hard_sigmoid") { - act_mode = 10; - } else { - // TODO(hong19860320) support more activation mode - LOG(FATAL) << "[NPU] Unsupported activation type " << act_type; - } - return act_mode; -} - -bool HasInputArg(const OpInfo* op_info, - const Scope* scope, - const std::string& argname) { - auto iarg_names = op_info->input_argnames(); - if (std::find(iarg_names.begin(), iarg_names.end(), argname) != - iarg_names.end()) { - auto inputs = op_info->Input(argname); - if (inputs.empty()) { - return false; - } - auto var_name = inputs.front(); - auto var = scope->FindVar(var_name); - return var != nullptr; - } else { - return false; - } -} - -} // namespace npu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/npu/builder.h b/lite/backends/npu/builder.h deleted file mode 100644 index 70200354fb..0000000000 --- a/lite/backends/npu/builder.h +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" -#include "ai_ddk_lib/include/hiai_ir_build.h" -#include "lite/core/op_lite.h" -#include "lite/core/target_wrapper.h" -#include "lite/core/tensor.h" - -// Extended Ops of HIAI DDK -namespace ge { -/** - * Pads a tensor. - * - * x : the input tensor - * padding : the input tensor must be 2-D - * constant_values : constant values must be a scalar - * - * output : the output tensor - * - * t_paddings : Default DT_INT32 , t_paddings must be the same with - * datatype of the padding - * mode : 0: CONSTANT, 1: REFLECT, 2: SYMMETRIC - * T : datatype of constant_values DT_INT32:3 DT_FLOAT:0 - */ -REG_OP(Pad) - .INPUT(x, TensorType({DT_FLOAT, DT_INT32})) - .INPUT(padding, TensorType({DT_INT32})) - .OPTIONAL_INPUT(constant_values, TensorType({DT_INT32, DT_FLOAT})) - .OUTPUT(output, TensorType({DT_FLOAT, DT_INT32})) - .ATTR(t_paddings, AttrValue::INT{3}) - .ATTR(mode, AttrValue::INT{0}) - .REQUIRED_ATTR(T, AttrValue::INT) - .OP_END(); - -} // namespace ge - -namespace paddle { -namespace lite { -namespace npu { - -class OpList { - public: - static OpList& Global() { - static thread_local OpList x; - return x; - } - void clear() { lists_.clear(); } - void add(std::shared_ptr p) { lists_.push_back(p); } - - private: - std::vector> lists_; -}; - -// Build HIAI IR graph to om model, and store om model data into lite tensor -bool BuildModel(std::vector& inputs, // NOLINT - std::vector& outputs, // NOLINT - lite::Tensor* model_data); - -std::string UniqueName(const std::string& prefix); - -ge::DataType CvtPrecisionType(PrecisionType itype); - -ge::Format CvtDataLayoutType(DataLayoutType itype); - -ge::TensorPtr CvtTensor(Tensor* in_tensor, - std::vector out_shape = {}, - PrecisionType in_ptype = PRECISION(kFloat), - DataLayoutType in_ltype = DATALAYOUT(kNCHW)); - -template -ge::TensorPtr CreateTensorAndFillData(std::vector data, - std::vector shape = {}, - ge::Format format = ge::FORMAT_NCHW) { - const std::type_info& info = typeid(T); - ge::DataType type = ge::DT_FLOAT; - if (info == typeid(float)) { - type = ge::DT_FLOAT; - } else if (info == typeid(int8_t)) { - type = ge::DT_INT8; - } else if (info == typeid(int32_t)) { - type = ge::DT_INT32; - } else { - LOG(FATAL) << "[NPU] Unknow value type " << info.name(); - } - if (shape.empty()) { - shape = {static_cast(data.size())}; - } else { - int size = 1; - for (auto i : shape) { - size *= i; - } - CHECK_EQ(data.size(), size); - } - ge::TensorDesc desc(ge::Shape(shape), format, type); - ge::TensorPtr tensor = std::make_shared(); - tensor->SetTensorDesc(desc); - tensor->SetData(reinterpret_cast(data.data()), - data.size() * sizeof(T)); - return tensor; -} - -template -ge::TensorPtr CreateTensorAndFillData(T value, - std::vector shape = {1}, - ge::Format format = ge::FORMAT_NCHW) { - int64_t size = 1; - 
for (auto i : shape) { - size *= i; - } - std::vector data(size, value); - return CreateTensorAndFillData(data, shape, format); -} - -int CvtActMode(std::string act_type); - -bool HasInputArg(const OpInfo* op_info, - const Scope* scope, - const std::string& argname); - -} // namespace npu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/npu/device.cc b/lite/backends/npu/device.cc old mode 100755 new mode 100644 diff --git a/lite/backends/npu/device.h b/lite/backends/npu/device.h old mode 100755 new mode 100644 index 3eba0b77e4..411600ae0a --- a/lite/backends/npu/device.h +++ b/lite/backends/npu/device.h @@ -18,8 +18,8 @@ #include #include #include -#include "ai_ddk_lib/include/HiAiModelManagerService.h" -#include "ai_ddk_lib/include/hiai_ir_build.h" +#include "HiAiModelManagerService.h" // NOLINT +#include "hiai_ir_build.h" // NOLINT namespace paddle { namespace lite { diff --git a/lite/backends/npu/runtime.cc b/lite/backends/npu/runtime.cc deleted file mode 100644 index 3485f63c7c..0000000000 --- a/lite/backends/npu/runtime.cc +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/npu/runtime.h" -#include -#include -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace npu { - -// Create hiai model manager to load om model from lite tensor, and return the -// manager and an unique model name -bool LoadModel(const lite::Tensor &model_data, - std::shared_ptr *model_client, - std::string *model_name) { - LOG(INFO) << "[NPU] Load model."; - auto model_data_ptr = model_data.data(); - auto model_data_size = model_data.numel() * sizeof(int8_t); - if (model_data_ptr == nullptr || model_data_size == 0) { - return false; - } - *model_client = std::make_shared(); - int ret = (*model_client)->Init(nullptr); - if (ret != hiai::AI_SUCCESS) { - LOG(WARNING) << "[NPU] AiModelMngerClient init failed(" << ret << ")!"; - return false; - } - *model_name = "model.om"; - auto model_desc = std::make_shared( - *model_name, - DeviceInfo::Global().freq_level(), - DeviceInfo::Global().framework_type(), - DeviceInfo::Global().model_type(), - DeviceInfo::Global().device_type()); - model_desc->SetModelBuffer(model_data_ptr, model_data_size); - std::vector> model_descs; - model_descs.push_back(model_desc); - if ((*model_client)->Load(model_descs) != hiai::AI_SUCCESS) { - LOG(WARNING) << "[NPU] AiModelMngerClient load model failed!"; - return false; - } - return true; -} - -} // namespace npu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/npu/runtime.h b/lite/backends/npu/runtime.h deleted file mode 100644 index 8b1ad51518..0000000000 --- a/lite/backends/npu/runtime.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include "ai_ddk_lib/include/HiAiModelManagerService.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace npu { - -class DeviceInfo { - public: - static DeviceInfo &Global() { - static DeviceInfo x; - return x; - } - DeviceInfo() {} - - int freq_level() { return freq_level_; } - int framework_type() { return framework_type_; } - int model_type() { return model_type_; } - int device_type() { return device_type_; } - - private: - int freq_level_{3}; - int framework_type_{0}; - int model_type_{0}; - int device_type_{0}; -}; - -bool LoadModel(const lite::Tensor &model_data, - std::shared_ptr *model_client, - std::string *model_name); -} // namespace npu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl b/lite/backends/opencl/cl_kernel/image/conv2d_1x1_kernel.cl old mode 100755 new mode 100644 diff --git a/lite/backends/opencl/cl_kernel/image/reshape_kernel.cl b/lite/backends/opencl/cl_kernel/image/reshape_kernel.cl old mode 100755 new mode 100644 diff --git a/lite/backends/x86/jit/README.en.md b/lite/backends/x86/jit/README.en.md index cd2aa5c242..dc9eb4cf23 100644 --- a/lite/backends/x86/jit/README.en.md +++ b/lite/backends/x86/jit/README.en.md @@ -89,7 +89,7 @@ All kernels are inlcuded in `lite/backends/x86/jit/kernels.h`, which is automati 3. Add reference function of `your_key`. Note: - this should be run on CPU and do not depend on any third-party. - - Add `USE_JITKERNEL_REFER(your_key)` in `refer/CmakeLists.txt` to make sure this code can be used. + - Add `USE_JITKERNEL_REFER_LITE(your_key)` in `refer/CmakeLists.txt` to make sure this code can be used. 4. Add unit test in `test.cc`, and verfiy at least `float` and `double`. Test more data type for some special functions if necessary, for example `int8`. 5. Add functions in `benchmark.cc` to test all function of same `KernelType`. Make sure `GetDefaultBestFunc` always get the best one. diff --git a/lite/backends/x86/jit/README.md b/lite/backends/x86/jit/README.md index 6998c5d867..bc0e27234d 100644 --- a/lite/backends/x86/jit/README.md +++ b/lite/backends/x86/jit/README.md @@ -79,7 +79,7 @@ PaddlePaddle/Paddle/paddle/fluid/ # 如何添加新的算子 1. 在`KernelType` 中添加 `your_key` 。 -2. 实现Reference 的逻辑,这个是必须是在CPU上的实现,并且不能依赖任何第三方库。实现后在`refer/CmakeLists.txt`中添加`USE_JITKERNEL_REFER(your_key)`来使用该kernel。 +2. 实现Reference 的逻辑,这个是必须是在CPU上的实现,并且不能依赖任何第三方库。实现后在`refer/CmakeLists.txt`中添加`USE_JITKERNEL_REFER_LITE(your_key)`来使用该kernel。 3. (optional) 实现更多的算法在`more`目录下,可以依赖mkl,intrinsic或者mkldnn等第三方库。 4. (optional) 实现基于Xbyak的生成code,在`gen`目下。 jitcode需要实现自己的`JitCodeCreator`,并注册在与refer相同的`KernelType`上。 5. 
添加新的`KernelTuple`,需要与`KernelType`一一对应,是所有类型的一个打包,包括数据类型,属性的类型,以及返回的函数类型。可以参考`SeqPoolTuple`,新加的Attr类型需要特例化`JitCodeKey`方法。 diff --git a/lite/backends/x86/jit/gen/CMakeLists.txt b/lite/backends/x86/jit/gen/CMakeLists.txt index 99244ea9bd..6250077528 100644 --- a/lite/backends/x86/jit/gen/CMakeLists.txt +++ b/lite/backends/x86/jit/gen/CMakeLists.txt @@ -4,33 +4,33 @@ file(GLOB jitcode_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") cc_library(jit_kernel_jitcode SRCS ${jitcode_cc_srcs} DEPS jit_kernel_base xbyak) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode PARENT_SCOPE) -function(USE_JITKERNEL_GEN TARGET) - file(APPEND ${jit_file} "USE_JITKERNEL_GEN(${TARGET});\n") +function(USE_JITKERNEL_GEN_LITE TARGET) + file(APPEND ${jit_file} "USE_JITKERNEL_GEN_LITE(${TARGET});\n") endfunction() # use gen jitcode kernel by name -USE_JITKERNEL_GEN(kMatMul) -USE_JITKERNEL_GEN(kVMul) -USE_JITKERNEL_GEN(kVAdd) -USE_JITKERNEL_GEN(kVSub) -USE_JITKERNEL_GEN(kVAddRelu) -USE_JITKERNEL_GEN(kVScal) -USE_JITKERNEL_GEN(kVAddBias) -USE_JITKERNEL_GEN(kVRelu) -USE_JITKERNEL_GEN(kVSquare) -USE_JITKERNEL_GEN(kVIdentity) -USE_JITKERNEL_GEN(kVExp) -USE_JITKERNEL_GEN(kVSigmoid) -USE_JITKERNEL_GEN(kVTanh) -USE_JITKERNEL_GEN(kLSTMCtHt) -USE_JITKERNEL_GEN(kLSTMC1H1) -USE_JITKERNEL_GEN(kGRUH1) -USE_JITKERNEL_GEN(kGRUHtPart1) -USE_JITKERNEL_GEN(kGRUHtPart2) -USE_JITKERNEL_GEN(kNCHW16CMulNC) -USE_JITKERNEL_GEN(kSeqPool) -USE_JITKERNEL_GEN(kHMax) -USE_JITKERNEL_GEN(kHSum) -USE_JITKERNEL_GEN(kEmbSeqPool) -USE_JITKERNEL_GEN(kSgd) -USE_JITKERNEL_GEN(kVBroadcast) +USE_JITKERNEL_GEN_LITE(kMatMul) +USE_JITKERNEL_GEN_LITE(kVMul) +USE_JITKERNEL_GEN_LITE(kVAdd) +USE_JITKERNEL_GEN_LITE(kVSub) +USE_JITKERNEL_GEN_LITE(kVAddRelu) +USE_JITKERNEL_GEN_LITE(kVScal) +USE_JITKERNEL_GEN_LITE(kVAddBias) +USE_JITKERNEL_GEN_LITE(kVRelu) +USE_JITKERNEL_GEN_LITE(kVSquare) +USE_JITKERNEL_GEN_LITE(kVIdentity) +USE_JITKERNEL_GEN_LITE(kVExp) +USE_JITKERNEL_GEN_LITE(kVSigmoid) +USE_JITKERNEL_GEN_LITE(kVTanh) +USE_JITKERNEL_GEN_LITE(kLSTMCtHt) +USE_JITKERNEL_GEN_LITE(kLSTMC1H1) +USE_JITKERNEL_GEN_LITE(kGRUH1) +USE_JITKERNEL_GEN_LITE(kGRUHtPart1) +USE_JITKERNEL_GEN_LITE(kGRUHtPart2) +USE_JITKERNEL_GEN_LITE(kNCHW16CMulNC) +USE_JITKERNEL_GEN_LITE(kSeqPool) +USE_JITKERNEL_GEN_LITE(kHMax) +USE_JITKERNEL_GEN_LITE(kHSum) +USE_JITKERNEL_GEN_LITE(kEmbSeqPool) +USE_JITKERNEL_GEN_LITE(kSgd) +USE_JITKERNEL_GEN_LITE(kVBroadcast) diff --git a/lite/backends/x86/jit/gen/act.cc b/lite/backends/x86/jit/gen/act.cc index f1f261c199..45f4f7ddcc 100644 --- a/lite/backends/x86/jit/gen/act.cc +++ b/lite/backends/x86/jit/gen/act.cc @@ -156,9 +156,9 @@ size_t VTanhCreator::CodeSize(const int& d) const { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kVRelu, gen::VReluCreator); -REGISTER_JITKERNEL_GEN(kVSquare, gen::VSquareCreator); -REGISTER_JITKERNEL_GEN(kVIdentity, gen::VIdentityCreator); -REGISTER_JITKERNEL_GEN(kVExp, gen::VExpCreator); -REGISTER_JITKERNEL_GEN(kVSigmoid, gen::VSigmoidCreator); -REGISTER_JITKERNEL_GEN(kVTanh, gen::VTanhCreator); +REGISTER_JITKERNEL_GEN_LITE(kVRelu, gen::VReluCreator); +REGISTER_JITKERNEL_GEN_LITE(kVSquare, gen::VSquareCreator); +REGISTER_JITKERNEL_GEN_LITE(kVIdentity, gen::VIdentityCreator); +REGISTER_JITKERNEL_GEN_LITE(kVExp, gen::VExpCreator); +REGISTER_JITKERNEL_GEN_LITE(kVSigmoid, gen::VSigmoidCreator); +REGISTER_JITKERNEL_GEN_LITE(kVTanh, gen::VTanhCreator); diff --git a/lite/backends/x86/jit/gen/blas.cc b/lite/backends/x86/jit/gen/blas.cc index 0bddea6ace..37183e6640 100644 --- 
a/lite/backends/x86/jit/gen/blas.cc +++ b/lite/backends/x86/jit/gen/blas.cc @@ -181,10 +181,10 @@ DECLARE_BLAS_CREATOR(VAddBias); namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kVMul, gen::VMulCreator); -REGISTER_JITKERNEL_GEN(kVAdd, gen::VAddCreator); -REGISTER_JITKERNEL_GEN(kVSub, gen::VSubCreator); -REGISTER_JITKERNEL_GEN(kVAddRelu, gen::VAddReluCreator); -REGISTER_JITKERNEL_GEN(kVScal, gen::VScalCreator); -REGISTER_JITKERNEL_GEN(kVAddBias, gen::VAddBiasCreator); -REGISTER_JITKERNEL_GEN(kNCHW16CMulNC, gen::NCHW16CMulNCCreator); +REGISTER_JITKERNEL_GEN_LITE(kVMul, gen::VMulCreator); +REGISTER_JITKERNEL_GEN_LITE(kVAdd, gen::VAddCreator); +REGISTER_JITKERNEL_GEN_LITE(kVSub, gen::VSubCreator); +REGISTER_JITKERNEL_GEN_LITE(kVAddRelu, gen::VAddReluCreator); +REGISTER_JITKERNEL_GEN_LITE(kVScal, gen::VScalCreator); +REGISTER_JITKERNEL_GEN_LITE(kVAddBias, gen::VAddBiasCreator); +REGISTER_JITKERNEL_GEN_LITE(kNCHW16CMulNC, gen::NCHW16CMulNCCreator); diff --git a/lite/backends/x86/jit/gen/embseqpool.cc b/lite/backends/x86/jit/gen/embseqpool.cc index 2ff6894383..7e697014ed 100644 --- a/lite/backends/x86/jit/gen/embseqpool.cc +++ b/lite/backends/x86/jit/gen/embseqpool.cc @@ -145,4 +145,4 @@ class EmbSeqPoolCreator : public JitCodeCreator { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kEmbSeqPool, gen::EmbSeqPoolCreator); +REGISTER_JITKERNEL_GEN_LITE(kEmbSeqPool, gen::EmbSeqPoolCreator); diff --git a/lite/backends/x86/jit/gen/gru.cc b/lite/backends/x86/jit/gen/gru.cc index c5737faf13..4c2c57413e 100644 --- a/lite/backends/x86/jit/gen/gru.cc +++ b/lite/backends/x86/jit/gen/gru.cc @@ -111,6 +111,6 @@ DECLARE_GRU_CREATOR(GRUHtPart2); namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kGRUH1, gen::GRUH1Creator); -REGISTER_JITKERNEL_GEN(kGRUHtPart1, gen::GRUHtPart1Creator); -REGISTER_JITKERNEL_GEN(kGRUHtPart2, gen::GRUHtPart2Creator); +REGISTER_JITKERNEL_GEN_LITE(kGRUH1, gen::GRUH1Creator); +REGISTER_JITKERNEL_GEN_LITE(kGRUHtPart1, gen::GRUHtPart1Creator); +REGISTER_JITKERNEL_GEN_LITE(kGRUHtPart2, gen::GRUHtPart2Creator); diff --git a/lite/backends/x86/jit/gen/hopv.cc b/lite/backends/x86/jit/gen/hopv.cc index 4304dc48c5..0fdd63a740 100644 --- a/lite/backends/x86/jit/gen/hopv.cc +++ b/lite/backends/x86/jit/gen/hopv.cc @@ -99,5 +99,5 @@ DECLARE_HOP_CREATOR(HSum); namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kHMax, gen::HMaxCreator); -REGISTER_JITKERNEL_GEN(kHSum, gen::HSumCreator); +REGISTER_JITKERNEL_GEN_LITE(kHMax, gen::HMaxCreator); +REGISTER_JITKERNEL_GEN_LITE(kHSum, gen::HSumCreator); diff --git a/lite/backends/x86/jit/gen/lstm.cc b/lite/backends/x86/jit/gen/lstm.cc index 44e58d0b75..e441735520 100644 --- a/lite/backends/x86/jit/gen/lstm.cc +++ b/lite/backends/x86/jit/gen/lstm.cc @@ -138,5 +138,5 @@ DECLARE_LSTM_CREATOR(LSTMC1H1); namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kLSTMCtHt, gen::LSTMCtHtCreator); -REGISTER_JITKERNEL_GEN(kLSTMC1H1, gen::LSTMC1H1Creator); +REGISTER_JITKERNEL_GEN_LITE(kLSTMCtHt, gen::LSTMCtHtCreator); +REGISTER_JITKERNEL_GEN_LITE(kLSTMC1H1, gen::LSTMC1H1Creator); diff --git a/lite/backends/x86/jit/gen/matmul.cc b/lite/backends/x86/jit/gen/matmul.cc index 2c75f6dd5d..010c80fac4 100644 --- a/lite/backends/x86/jit/gen/matmul.cc +++ b/lite/backends/x86/jit/gen/matmul.cc @@ -130,4 +130,4 @@ class MatMulCreator : public JitCodeCreator { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kMatMul, gen::MatMulCreator); +REGISTER_JITKERNEL_GEN_LITE(kMatMul, gen::MatMulCreator); diff --git 
a/lite/backends/x86/jit/gen/seqpool.cc b/lite/backends/x86/jit/gen/seqpool.cc index e0cf5e5a5a..4c80737aac 100644 --- a/lite/backends/x86/jit/gen/seqpool.cc +++ b/lite/backends/x86/jit/gen/seqpool.cc @@ -82,4 +82,4 @@ class SeqPoolCreator : public JitCodeCreator { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kSeqPool, gen::SeqPoolCreator); +REGISTER_JITKERNEL_GEN_LITE(kSeqPool, gen::SeqPoolCreator); diff --git a/lite/backends/x86/jit/gen/sgd.cc b/lite/backends/x86/jit/gen/sgd.cc index 10659f5084..44e0833661 100644 --- a/lite/backends/x86/jit/gen/sgd.cc +++ b/lite/backends/x86/jit/gen/sgd.cc @@ -127,4 +127,4 @@ class SgdCreator : public JitCodeCreator { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kSgd, gen::SgdCreator); +REGISTER_JITKERNEL_GEN_LITE(kSgd, gen::SgdCreator); diff --git a/lite/backends/x86/jit/gen/vbroadcast.cc b/lite/backends/x86/jit/gen/vbroadcast.cc index 9e02dca8c4..fb1e71f7b0 100644 --- a/lite/backends/x86/jit/gen/vbroadcast.cc +++ b/lite/backends/x86/jit/gen/vbroadcast.cc @@ -88,4 +88,4 @@ class VBroadcastCreator : public JitCodeCreator { namespace gen = paddle::lite::jit::gen; -REGISTER_JITKERNEL_GEN(kVBroadcast, gen::VBroadcastCreator); +REGISTER_JITKERNEL_GEN_LITE(kVBroadcast, gen::VBroadcastCreator); diff --git a/lite/backends/x86/jit/more/CMakeLists.txt b/lite/backends/x86/jit/more/CMakeLists.txt index 2ddbbcd16a..5641466d8a 100644 --- a/lite/backends/x86/jit/more/CMakeLists.txt +++ b/lite/backends/x86/jit/more/CMakeLists.txt @@ -1,6 +1,6 @@ -function(USE_JITKERNEL_MORE TARGET TYPE) - file(APPEND ${jit_file} "USE_JITKERNEL_MORE(${TARGET} ${TYPE});\n") +function(USE_JITKERNEL_MORE_LITE TARGET TYPE) + file(APPEND ${jit_file} "USE_JITKERNEL_MORE_LITE(${TARGET} ${TYPE});\n") endfunction() # enable it latter diff --git a/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt b/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt index 468937a4f6..80dabc72fb 100644 --- a/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt +++ b/lite/backends/x86/jit/more/intrinsic/CMakeLists.txt @@ -5,5 +5,5 @@ cc_library(jit_kernel_intrinsic SRCS ${jit_kernel_cc_intrinsic} DEPS jit_kernel_ set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_intrinsic PARENT_SCOPE) # use mkl kernels by name and type -USE_JITKERNEL_MORE(kCRFDecoding, intrinsic) -USE_JITKERNEL_MORE(kLayerNorm, intrinsic) +USE_JITKERNEL_MORE_LITE(kCRFDecoding, intrinsic) +USE_JITKERNEL_MORE_LITE(kLayerNorm, intrinsic) diff --git a/lite/backends/x86/jit/more/mix/CMakeLists.txt b/lite/backends/x86/jit/more/mix/CMakeLists.txt index dd039d2915..5e0238f26f 100644 --- a/lite/backends/x86/jit/more/mix/CMakeLists.txt +++ b/lite/backends/x86/jit/more/mix/CMakeLists.txt @@ -5,11 +5,11 @@ cc_library(jit_kernel_mix SRCS ${jit_kernel_mix_cc} DEPS jit_kernel_base) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_mix PARENT_SCOPE) -USE_JITKERNEL_MORE(kVSigmoid, mix) -USE_JITKERNEL_MORE(kVTanh, mix) -USE_JITKERNEL_MORE(kLSTMCtHt, mix) -USE_JITKERNEL_MORE(kLSTMC1H1, mix) -USE_JITKERNEL_MORE(kGRUH1, mix) -USE_JITKERNEL_MORE(kGRUHtPart1, mix) -USE_JITKERNEL_MORE(kGRUHtPart2, mix) -USE_JITKERNEL_MORE(kSoftmax, mix) +USE_JITKERNEL_MORE_LITE(kVSigmoid, mix) +USE_JITKERNEL_MORE_LITE(kVTanh, mix) +USE_JITKERNEL_MORE_LITE(kLSTMCtHt, mix) +USE_JITKERNEL_MORE_LITE(kLSTMC1H1, mix) +USE_JITKERNEL_MORE_LITE(kGRUH1, mix) +USE_JITKERNEL_MORE_LITE(kGRUHtPart1, mix) +USE_JITKERNEL_MORE_LITE(kGRUHtPart2, mix) +USE_JITKERNEL_MORE_LITE(kSoftmax, mix) diff --git a/lite/backends/x86/jit/more/mkl/CMakeLists.txt 
b/lite/backends/x86/jit/more/mkl/CMakeLists.txt index 56f1a62ad4..3557f531a5 100644 --- a/lite/backends/x86/jit/more/mkl/CMakeLists.txt +++ b/lite/backends/x86/jit/more/mkl/CMakeLists.txt @@ -3,18 +3,18 @@ cc_library(jit_kernel_mkl SRCS mkl.cc DEPS jit_kernel_base dynload_mklml) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} dynload_mklml jit_kernel_mkl PARENT_SCOPE) # use mkl kernels by name and type -USE_JITKERNEL_MORE(kMatMul, mkl) -USE_JITKERNEL_MORE(kVMul, mkl) -USE_JITKERNEL_MORE(kVAdd, mkl) -USE_JITKERNEL_MORE(kVScal, mkl) -USE_JITKERNEL_MORE(kStrideScal, mkl) -USE_JITKERNEL_MORE(kVExp, mkl) -USE_JITKERNEL_MORE(kVSquare, mkl) -USE_JITKERNEL_MORE(kVCopy, mkl) -USE_JITKERNEL_MORE(kVSigmoid, mkl) -USE_JITKERNEL_MORE(kVTanh, mkl) -USE_JITKERNEL_MORE(kSeqPool, mkl) -USE_JITKERNEL_MORE(kSoftmax, mkl) -USE_JITKERNEL_MORE(kEmbSeqPool, mkl) -USE_JITKERNEL_MORE(kSgd, mkl) -USE_JITKERNEL_MORE(kVBroadcast, mkl) +USE_JITKERNEL_MORE_LITE(kMatMul, mkl) +USE_JITKERNEL_MORE_LITE(kVMul, mkl) +USE_JITKERNEL_MORE_LITE(kVAdd, mkl) +USE_JITKERNEL_MORE_LITE(kVScal, mkl) +USE_JITKERNEL_MORE_LITE(kStrideScal, mkl) +USE_JITKERNEL_MORE_LITE(kVExp, mkl) +USE_JITKERNEL_MORE_LITE(kVSquare, mkl) +USE_JITKERNEL_MORE_LITE(kVCopy, mkl) +USE_JITKERNEL_MORE_LITE(kVSigmoid, mkl) +USE_JITKERNEL_MORE_LITE(kVTanh, mkl) +USE_JITKERNEL_MORE_LITE(kSeqPool, mkl) +USE_JITKERNEL_MORE_LITE(kSoftmax, mkl) +USE_JITKERNEL_MORE_LITE(kEmbSeqPool, mkl) +USE_JITKERNEL_MORE_LITE(kSgd, mkl) +USE_JITKERNEL_MORE_LITE(kVBroadcast, mkl) diff --git a/lite/backends/x86/jit/refer/CMakeLists.txt b/lite/backends/x86/jit/refer/CMakeLists.txt index 7133f59662..c52b21ad7d 100644 --- a/lite/backends/x86/jit/refer/CMakeLists.txt +++ b/lite/backends/x86/jit/refer/CMakeLists.txt @@ -2,39 +2,39 @@ cc_library(jit_kernel_refer SRCS refer.cc DEPS jit_kernel_base) set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_refer PARENT_SCOPE) -function(USE_JITKERNEL_REFER TARGET) - file(APPEND ${jit_file} "USE_JITKERNEL_REFER(${TARGET});\n") +function(USE_JITKERNEL_REFER_LITE TARGET) + file(APPEND ${jit_file} "USE_JITKERNEL_REFER_LITE(${TARGET});\n") endfunction() # use refer kernel by name -USE_JITKERNEL_REFER(kVMul) -USE_JITKERNEL_REFER(kVAdd) -USE_JITKERNEL_REFER(kVAddRelu) -USE_JITKERNEL_REFER(kVSub) -USE_JITKERNEL_REFER(kVScal) -USE_JITKERNEL_REFER(kStrideScal) -USE_JITKERNEL_REFER(kVAddBias) -USE_JITKERNEL_REFER(kVCopy) -USE_JITKERNEL_REFER(kVRelu) -USE_JITKERNEL_REFER(kVIdentity) -USE_JITKERNEL_REFER(kVExp) -USE_JITKERNEL_REFER(kVSigmoid) -USE_JITKERNEL_REFER(kVTanh) -USE_JITKERNEL_REFER(kLSTMCtHt) -USE_JITKERNEL_REFER(kLSTMC1H1) -USE_JITKERNEL_REFER(kGRUH1) -USE_JITKERNEL_REFER(kGRUHtPart1) -USE_JITKERNEL_REFER(kGRUHtPart2) -USE_JITKERNEL_REFER(kCRFDecoding) -USE_JITKERNEL_REFER(kLayerNorm) -USE_JITKERNEL_REFER(kNCHW16CMulNC) -USE_JITKERNEL_REFER(kSeqPool) -USE_JITKERNEL_REFER(kMatMul) -USE_JITKERNEL_REFER(kVSquare) -USE_JITKERNEL_REFER(kHSum) -USE_JITKERNEL_REFER(kHMax) -USE_JITKERNEL_REFER(kStrideASum) -USE_JITKERNEL_REFER(kSoftmax) -USE_JITKERNEL_REFER(kEmbSeqPool) -USE_JITKERNEL_REFER(kSgd) -USE_JITKERNEL_REFER(kVBroadcast) +USE_JITKERNEL_REFER_LITE(kVMul) +USE_JITKERNEL_REFER_LITE(kVAdd) +USE_JITKERNEL_REFER_LITE(kVAddRelu) +USE_JITKERNEL_REFER_LITE(kVSub) +USE_JITKERNEL_REFER_LITE(kVScal) +USE_JITKERNEL_REFER_LITE(kStrideScal) +USE_JITKERNEL_REFER_LITE(kVAddBias) +USE_JITKERNEL_REFER_LITE(kVCopy) +USE_JITKERNEL_REFER_LITE(kVRelu) +USE_JITKERNEL_REFER_LITE(kVIdentity) +USE_JITKERNEL_REFER_LITE(kVExp) +USE_JITKERNEL_REFER_LITE(kVSigmoid) 
+USE_JITKERNEL_REFER_LITE(kVTanh) +USE_JITKERNEL_REFER_LITE(kLSTMCtHt) +USE_JITKERNEL_REFER_LITE(kLSTMC1H1) +USE_JITKERNEL_REFER_LITE(kGRUH1) +USE_JITKERNEL_REFER_LITE(kGRUHtPart1) +USE_JITKERNEL_REFER_LITE(kGRUHtPart2) +USE_JITKERNEL_REFER_LITE(kCRFDecoding) +USE_JITKERNEL_REFER_LITE(kLayerNorm) +USE_JITKERNEL_REFER_LITE(kNCHW16CMulNC) +USE_JITKERNEL_REFER_LITE(kSeqPool) +USE_JITKERNEL_REFER_LITE(kMatMul) +USE_JITKERNEL_REFER_LITE(kVSquare) +USE_JITKERNEL_REFER_LITE(kHSum) +USE_JITKERNEL_REFER_LITE(kHMax) +USE_JITKERNEL_REFER_LITE(kStrideASum) +USE_JITKERNEL_REFER_LITE(kSoftmax) +USE_JITKERNEL_REFER_LITE(kEmbSeqPool) +USE_JITKERNEL_REFER_LITE(kSgd) +USE_JITKERNEL_REFER_LITE(kVBroadcast) diff --git a/lite/backends/x86/jit/refer/refer.cc b/lite/backends/x86/jit/refer/refer.cc index e1b1240c5d..c47f8216ab 100644 --- a/lite/backends/x86/jit/refer/refer.cc +++ b/lite/backends/x86/jit/refer/refer.cc @@ -18,7 +18,7 @@ namespace refer = paddle::lite::jit::refer; #define REGISTER_REFER_KERNEL(func) \ - REGISTER_JITKERNEL_REFER( \ + REGISTER_JITKERNEL_REFER_LITE( \ k##func, refer::func##Kernel, refer::func##Kernel) REGISTER_REFER_KERNEL(VMul); diff --git a/lite/backends/x86/jit/registry.h b/lite/backends/x86/jit/registry.h index 7613a8dd43..65e3152d70 100644 --- a/lite/backends/x86/jit/registry.h +++ b/lite/backends/x86/jit/registry.h @@ -77,16 +77,16 @@ class JitKernelRegistrar { void Touch() {} }; -#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \ +#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE(uniq_name, msg) \ struct __test_global_namespace_##uniq_name##__ {}; \ static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ __test_global_namespace_##uniq_name##__>::value, \ msg) // Refer always on CPUPlace -#define REGISTER_JITKERNEL_REFER(kernel_type, ...) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_refer_CPUPlace, \ +#define REGISTER_JITKERNEL_REFER_LITE(kernel_type, ...) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_##kernel_type##_refer_CPUPlace, \ "REGISTER_KERNEL_REFER must be called in global namespace"); \ static ::paddle::lite::jit::JitKernelRegistrar< \ ::paddle::lite::jit::ReferKernelPool, \ @@ -94,84 +94,84 @@ class JitKernelRegistrar { __VA_ARGS__> \ __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_( \ ::paddle::lite::jit::KernelType::kernel_type); \ - int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() { \ + int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_() { \ __jit_kernel_registrar_##kernel_type##_refer_CPUPlace_.Touch(); \ return 0; \ } // kernel_type: should be in paddle::lite::jit::KernelType // place_type: should be one of CPUPlace and GPUPlace in paddle::platform -#define REGISTER_KERNEL_MORE(kernel_type, impl_type, place_type, ...) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_##impl_type##_##place_type, \ - "REGISTER_KERNEL_MORE must be called in global namespace"); \ - extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ +#define REGISTER_KERNEL_MORE_LITE(kernel_type, impl_type, place_type, ...) 
\ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_##kernel_type##_##impl_type##_##place_type, \ + "REGISTER_KERNEL_MORE_LITE must be called in global namespace"); \ + extern int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ static int __assert_##kernel_type##_##impl_type##_##place_type##_has_refer_ \ - UNUSED = TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + UNUSED = LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ static ::paddle::lite::jit::JitKernelRegistrar< \ ::paddle::lite::jit::KernelPool, \ ::paddle::lite::fluid::place_type, \ __VA_ARGS__> \ __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_( \ ::paddle::lite::jit::KernelType::kernel_type); \ - int TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() { \ + int LiteTouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() { \ __jit_kernel_registrar_##kernel_type##_##impl_type##_##place_type##_ \ .Touch(); \ return 0; \ } #define REGISTER_JITKERNEL_MORE(kernel_type, impl_type, ...) \ - REGISTER_KERNEL_MORE(kernel_type, impl_type, CPUPlace, __VA_ARGS__) - -#define REGISTER_GPUKERNEL_MORE(kernel_type, impl_type, ...) \ - REGISTER_KERNEL_MORE(kernel_type, impl_type, GPUPlace, __VA_ARGS__) - -#define REGISTER_JITKERNEL_GEN(kernel_type, ...) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_gen_##kernel_type##_CPUPlace_, \ - "REGISTER_JITKERNEL_GEN must be called in global namespace"); \ - extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static int __assert_gen_##kernel_type##_has_refer_ UNUSED = \ - TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static ::paddle::lite::jit::JitKernelRegistrar< \ - ::paddle::lite::jit::JitCodeCreatorPool, \ - ::paddle::lite::fluid::CPUPlace, \ - __VA_ARGS__> \ - __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_( \ - ::paddle::lite::jit::KernelType::kernel_type); \ - int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_() { \ - __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_.Touch(); \ - return 0; \ + REGISTER_KERNEL_MORE_LITE(kernel_type, impl_type, CPUPlace, __VA_ARGS__) + +#define REGISTER_GPUKERNEL_MORE_LITE(kernel_type, impl_type, ...) \ + REGISTER_KERNEL_MORE_LITE(kernel_type, impl_type, GPUPlace, __VA_ARGS__) + +#define REGISTER_JITKERNEL_GEN_LITE(kernel_type, ...) 
\ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_gen_##kernel_type##_CPUPlace_, \ + "REGISTER_JITKERNEL_GEN_LITE must be called in global namespace"); \ + extern int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static int __assert_gen_##kernel_type##_has_refer_ UNUSED = \ + LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static ::paddle::lite::jit::JitKernelRegistrar< \ + ::paddle::lite::jit::JitCodeCreatorPool, \ + ::paddle::lite::fluid::CPUPlace, \ + __VA_ARGS__> \ + __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_( \ + ::paddle::lite::jit::KernelType::kernel_type); \ + int LiteTouchJitKernelReg_gen_##kernel_type##_CPUPlace_() { \ + __jit_kernel_registrar_gen_##kernel_type##_CPUPlace_.Touch(); \ + return 0; \ } -#define USE_JITKERNEL_GEN(kernel_type) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_gen_##kernel_type##_CPUPlace_, \ - "USE_JITKERNEL_GEN must be called in global namespace"); \ - extern int TouchJitKernelReg_gen_##kernel_type##_CPUPlace_(); \ - static int use_jitkernel_gen_##kernel_type##_CPUPlace_ UNUSED = \ - TouchJitKernelReg_gen_##kernel_type##_CPUPlace_() - -#define USE_JITKERNEL_REFER(kernel_type) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_refer_CPUPlace_, \ - "USE_JITKERNEL_REFER must be called in global namespace"); \ - extern int TouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ - static int use_jitkernel_##kernel_type##_refer_CPUPlace_ UNUSED = \ - TouchJitKernelReg_##kernel_type##_refer_CPUPlace_() - -#define USE_KERNEL_MORE(kernel_type, impl_type, place_type) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_jitkernel_##kernel_type##_##impl_type##_##place_type##_, \ - "USE_JITKERNEL_MORE must be called in global namespace"); \ - extern int \ - TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_(); \ - static int use_jitkernel_##kernel_type##_##impl_type##_##place_type##_ \ - UNUSED = \ - TouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() - -#define USE_JITKERNEL_MORE(kernel_type, impl_type) \ - USE_KERNEL_MORE(kernel_type, impl_type, CPUPlace) +#define USE_JITKERNEL_GEN_LITE(kernel_type) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_gen_##kernel_type##_CPUPlace_, \ + "USE_JITKERNEL_GEN_LITE must be called in global namespace"); \ + extern int LiteTouchJitKernelReg_gen_##kernel_type##_CPUPlace_(); \ + static int use_litejitkernel_gen_##kernel_type##_CPUPlace_ UNUSED = \ + LiteTouchJitKernelReg_gen_##kernel_type##_CPUPlace_() + +#define USE_JITKERNEL_REFER_LITE(kernel_type) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_##kernel_type##_refer_CPUPlace_, \ + "USE_JITKERNEL_REFER_LITE must be called in global namespace"); \ + extern int LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_(); \ + static int use_litejitkernel_##kernel_type##_refer_CPUPlace_ UNUSED = \ + LiteTouchJitKernelReg_##kernel_type##_refer_CPUPlace_() + +#define USE_KERNEL_MORE_LITE(kernel_type, impl_type, place_type) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_litejitkernel_##kernel_type##_##impl_type##_##place_type##_, \ + "USE_JITKERNEL_MORE_LITE must be called in global namespace"); \ + extern int \ + LiteTouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_(); \ + static int use_litejitkernel_##kernel_type##_##impl_type##_##place_type##_ \ + UNUSED = \ + LiteTouchJitKernelReg_##kernel_type##_##impl_type##_##place_type##_() + +#define 
USE_JITKERNEL_MORE_LITE(kernel_type, impl_type) \ + USE_KERNEL_MORE_LITE(kernel_type, impl_type, CPUPlace) } // namespace jit } // namespace lite diff --git a/lite/backends/x86/parallel.h b/lite/backends/x86/parallel.h old mode 100755 new mode 100644 diff --git a/lite/backends/xpu/builder.cc b/lite/backends/xpu/builder.cc deleted file mode 100644 index 796eaf9c46..0000000000 --- a/lite/backends/xpu/builder.cc +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/xpu/builder.h" -#include // NOLINT -#include -#include "lite/backends/xpu/runtime.h" - -namespace paddle { -namespace lite { -namespace xpu { - -bool HasInputArg(const OpInfo* op_info, - const Scope* scope, - const std::string& argname) { - auto iarg_names = op_info->input_argnames(); - if (std::find(iarg_names.begin(), iarg_names.end(), argname) != - iarg_names.end()) { - auto inputs = op_info->Input(argname); - if (inputs.empty()) { - return false; - } - auto var_name = inputs.front(); - auto var = scope->FindVar(var_name); - return var != nullptr; - } else { - return false; - } -} - -std::string UniqueName(const std::string& prefix) { - static std::mutex counter_mtx; - static std::unordered_map counter_map; - std::unique_lock counter_lck(counter_mtx); - int counter = 1; - auto it = counter_map.find(prefix); - if (it == counter_map.end()) { - counter_map[prefix] = counter; - } else { - counter = ++(it->second); - } - return prefix + "_" + std::to_string(counter); -} - -xtcl::DataType CvtPrecisionType(PrecisionType in_type) { - xtcl::DataType out_type = ::xtcl::Float(32); - switch (in_type) { - case PRECISION(kFloat): - out_type = ::xtcl::Float(32); - break; - case PRECISION(kInt8): - out_type = ::xtcl::Int(8); - break; - case PRECISION(kInt32): - out_type = ::xtcl::Int(32); - break; - default: - LOG(FATAL) << "Can not convert precision type(" << PrecisionToStr(in_type) - << ") from Lite to XPU"; - break; - } - return out_type; -} - -DLDataType CvtDataType(PrecisionType in_type) { - DLDataType out_type = {kDLFloat, 32, 1}; - switch (in_type) { - case PRECISION(kFloat): - out_type = {kDLFloat, 32, 1}; - break; - case PRECISION(kInt8): - out_type = {kDLInt, 8, 1}; - break; - case PRECISION(kInt32): - out_type = {kDLInt, 32, 1}; - break; - default: - LOG(FATAL) << "Can not convert data type(" << PrecisionToStr(in_type) - << ") from Lite to XPU"; - break; - } - return out_type; -} - -xtcl::Array CvtShape(const std::vector& in_shape) { - xtcl::Array out_shape; - for (auto dim : in_shape) { - out_shape.push_back(dim); - } - return out_shape; -} - -xtcl::Array CvtShape(const std::vector& in_shape) { - return CvtShape(std::vector(in_shape.begin(), in_shape.end())); -} - -xtcl::Array CvtShape(const DDim& in_dims) { - return CvtShape(in_dims.Vectorize()); -} - -std::shared_ptr CvtTensor(lite::Tensor* in_tensor, - std::vector out_shape, - PrecisionType in_ptype, - DataLayoutType in_ltype) { - uint8_t* in_data = 
nullptr; - auto in_size = in_tensor->dims().production(); - auto in_shape = in_tensor->dims().Vectorize(); - if (out_shape.empty()) { - out_shape = in_shape; - } - int in_bytes; - if (in_ptype == PRECISION(kFloat)) { - in_data = reinterpret_cast(in_tensor->mutable_data()); - in_bytes = in_size * sizeof(float); - } else if (in_ptype == PRECISION(kInt32)) { - in_data = reinterpret_cast(in_tensor->mutable_data()); - in_bytes = in_size * sizeof(int32_t); - } else if (in_ptype == PRECISION(kInt8)) { - in_data = reinterpret_cast(in_tensor->mutable_data()); - in_bytes = in_size * sizeof(int8_t); - } else { - LOG(FATAL) << "Unknow precision type " << PrecisionToStr(in_ptype); - } - auto out_tensor = std::make_shared( - xtcl::xNDArray::Empty(out_shape, CvtDataType(in_ptype), {kDLCPU, 0})); - auto out_data = - reinterpret_cast(out_tensor->ToDLPack()->dl_tensor.data); - std::memcpy(out_data, in_data, in_bytes); - return out_tensor; -} - -// Build the XPU subgraph to the XPU model, store the model data into the -// weight tensor of the graph op, and the model data will be loaded again -// by the graph computing kernel when the graph op is executed for inference. -// Due to the lack of XPU APIs for building and outputing the model data, -// the compiled XPU runtime object will be managed by the global variable -// 'DeviceInfo' and the key name for finding the runtime object will be -// stored in the weight tensor of graph op. -// TODO(hong19860320) Compile the XPU subgraph and output the compiled model -// data to the weight tensor of graph op. -bool BuildModel( - std::shared_ptr builder, - std::shared_ptr params, - std::vector>* outputs, - lite::Tensor* model) { - LOG(INFO) << "[XPU] Build Model."; - CHECK(builder != nullptr); - CHECK(outputs != nullptr); - CHECK_GT(outputs->size(), 0); - CHECK(model != nullptr); - - // build graph and fill all of constant params - xtcl::xNetwork network = builder->FinalizeNetwork(*((*outputs)[0])); - auto target = xtcl::Target::Create("llvm"); - auto compiler = xtcl::network::xTensorCompiler(network, target); - compiler.SetParams(*params); // set the data of constant tensors - compiler.Build(); - - // create and register runtime - auto runtime = std::make_shared( - compiler.CreateRuntimeInstance()); - if (runtime == nullptr) { - LOG(WARNING) << "[XPU] Build Model failed!"; - return false; - } - std::string name = UniqueName("xpu"); - LOG(INFO) << "[XPU] Model Name: " << name; - DeviceInfo::Global().Insert(name, runtime); - model->Resize({static_cast(name.length() + 1)}); - memcpy(model->mutable_data(), - reinterpret_cast(name.c_str()), - name.length() + 1); - return true; -} - -} // namespace xpu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/xpu/builder.h b/lite/backends/xpu/builder.h deleted file mode 100644 index f0ac2b303a..0000000000 --- a/lite/backends/xpu/builder.h +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include -#include -#include "lite/core/op_lite.h" -#include "lite/core/target_wrapper.h" -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace xpu { - -bool HasInputArg(const OpInfo* op_info, - const Scope* scope, - const std::string& argname); - -std::string UniqueName(const std::string& prefix); - -xtcl::DataType CvtPrecisionType(PrecisionType in_type); - -DLDataType CvtDataType(PrecisionType in_type); - -xtcl::Array CvtShape(const std::vector& in_shape); - -xtcl::Array CvtShape(const std::vector& in_shape); - -xtcl::Array CvtShape(const DDim& in_dims); - -std::shared_ptr CvtTensor( - Tensor* in_tensor, - std::vector out_shape = {}, - PrecisionType in_ptype = PRECISION(kFloat), - DataLayoutType in_ltype = DATALAYOUT(kNCHW)); - -bool BuildModel( - std::shared_ptr builder, - std::shared_ptr params, - std::vector>* outputs, - lite::Tensor* model); - -} // namespace xpu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/xpu/device.cc b/lite/backends/xpu/device.cc old mode 100755 new mode 100644 index dbf88ff833..badde878ad --- a/lite/backends/xpu/device.cc +++ b/lite/backends/xpu/device.cc @@ -36,8 +36,11 @@ std::unique_ptr Device::Build( } xtcl::xNetwork network = builder->FinalizeNetwork(xtcl::relay::TupleNode::make(all_outs)); - auto target = xtcl::Target::Create(device_name_); - auto compiler = xtcl::network::xTensorCompiler(network, target); + auto target = xtcl::NullValue(); + if (!target_.empty()) { + target = xtcl::Target::Create(target_); + } + xtcl::network::xTensorCompiler compiler(network, target); compiler.SetParams(*params); // Set the data of constant tensors compiler.Build(); VLOG(3) << "[XPU] Build done"; diff --git a/lite/backends/xpu/device.h b/lite/backends/xpu/device.h old mode 100755 new mode 100644 index bf9a8bf76a..6de18d5466 --- a/lite/backends/xpu/device.h +++ b/lite/backends/xpu/device.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include @@ -30,7 +31,18 @@ class Device { static Device x; return x; } - Device() {} + Device() { + char* name = std::getenv("XPU_DEVICE_NAME"); + if (name) { + name_ = std::string(name); + } + // XPU_DEVICE_TARGET for XPU model building, which supports 'llvm' and 'xpu + // -libs=xdnn' + char* target = std::getenv("XPU_DEVICE_TARGET"); + if (target) { + target_ = std::string(target); + } + } // Build the XPU graph to the XPU runtime, return the XPU runtime which can be // used to run inference. @@ -39,10 +51,12 @@ class Device { xtcl::network::xTensorCompiler::ParamNDArrayMap* params, std::vector* outputs); + const std::string name() const { return name_; } + const std::string target() const { return target_; } + private: - // Keep reserved fields - int device_id_{0}; - std::string device_name_{"llvm"}; + std::string name_{""}; + std::string target_{""}; }; } // namespace xpu diff --git a/lite/backends/xpu/runtime.cc b/lite/backends/xpu/runtime.cc deleted file mode 100644 index a2c34b9575..0000000000 --- a/lite/backends/xpu/runtime.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/backends/xpu/runtime.h" -#include -#include "lite/utils/cp_logging.h" - -namespace paddle { -namespace lite { -namespace xpu { - -// Extract the model data and recover the XPU model for inference, the function -// is called by the graph computing kernel when the graph op is executed. -// Due to the lack of XPU APIs for loading and recovering the XPU model from -// memory, the key name is obtained from the weight tensor of graph op, to get -// the runtime object for inference from the global variable 'DeviceInfo'. -// TODO(hong19860320) Recover the XPU model from the weight tensor of graph op. -bool LoadModel(const lite::Tensor &model, - std::shared_ptr *runtime) { - LOG(INFO) << "[XPU] Load Model."; - CHECK_GT(model.dims().production(), 0); - std::string name(reinterpret_cast(model.data())); - LOG(INFO) << "[XPU] Model Name: " << name; - CHECK(runtime != nullptr); - *runtime = DeviceInfo::Global().Find(name); - if (*runtime == nullptr) { - LOG(WARNING) << "[XPU] Load Model failed!"; - return false; - } - return true; -} - -} // namespace xpu -} // namespace lite -} // namespace paddle diff --git a/lite/backends/xpu/runtime.h b/lite/backends/xpu/runtime.h deleted file mode 100644 index 4ff8d75bce..0000000000 --- a/lite/backends/xpu/runtime.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include -#include -#include "lite/core/tensor.h" - -namespace paddle { -namespace lite { -namespace xpu { - -class DeviceInfo { - public: - static DeviceInfo& Global() { - static DeviceInfo x; - return x; - } - DeviceInfo() {} - - void Insert(const std::string& name, - std::shared_ptr runtime) { - if (runtimes_.find(name) != runtimes_.end()) { - LOG(WARNING) << "[XPU] Model " << name << " already exists."; - return; - } - runtimes_.emplace(std::make_pair(name, runtime)); - } - - void Clear() { runtimes_.clear(); } - - std::shared_ptr Find( - const std::string& name) const { - if (runtimes_.find(name) != runtimes_.end()) { - return runtimes_.at(name); - } else { - return nullptr; - } - } - - private: - int device_id_{0}; - std::string device_name_{"default"}; - std::unordered_map> - runtimes_; -}; - -bool LoadModel(const lite::Tensor& model, - std::shared_ptr* runtime); - -} // namespace xpu -} // namespace lite -} // namespace paddle diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index 57f353c0ee..1d0558451f 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -96,7 +96,15 @@ add_custom_command( add_custom_target(op_list_h DEPENDS ops.h) add_custom_target(kernel_list_h DEPENDS kernels.h) add_custom_target(all_kernel_faked_cc DEPENDS all_kernel_faked.cc) - +# create headfile to restore ops info sorted by suppported platforms +add_custom_command( + COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/record_supported_kernel_op.py + ${kernels_src_list} + ${ops_src_list} + ${CMAKE_BINARY_DIR}/supported_kernel_op_info.h + OUTPUT supported_kernel_op_info.h # not a real path to the output to force it execute every time. + ) + add_custom_target(supported_kernel_op_info_h DEPENDS supported_kernel_op_info.h) #----------------------------------------------- NOT CHANGE ----------------------------------------------- lite_cc_library(kernel SRCS kernel.cc DEPS context type_system target_wrapper any op_params tensor diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt index d379b31b84..1c85353d53 100644 --- a/lite/core/arena/CMakeLists.txt +++ b/lite/core/arena/CMakeLists.txt @@ -6,5 +6,5 @@ endif() lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${npu_kernels} ${bm_kernels} ${xpu_kernels} ${x86_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git a/lite/core/framework.proto b/lite/core/framework.proto index 5adf2a18b9..84b5502ff7 100644 --- a/lite/core/framework.proto +++ b/lite/core/framework.proto @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ syntax = "proto2"; -option optimize_for = LITE_RUNTIME; package paddle.framework.proto; // Any incompatible changes to ProgramDesc and its dependencies should diff --git a/lite/core/kernel.h b/lite/core/kernel.h index 86193235a2..18a1243c11 100644 --- a/lite/core/kernel.h +++ b/lite/core/kernel.h @@ -83,14 +83,11 @@ class KernelBase { #if defined(LITE_WITH_CUDA) WorkSpace::Global_CUDA().AllocReset(); #endif - #ifdef LITE_WITH_PROFILE - CHECK(profiler_) << "Profiler pointer of kernel can not be nullptr. " - "When LITE_WITH_PROFILE is defined, please set a " - "Profiler for Instruction."; - profiler_->StartTiming(profile_id_, ctx_.get()); + profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get()); + profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); Run(); - profiler_->StopTiming(profile_id_, ctx_.get()); + profiler_->StopTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); #else Run(); #endif diff --git a/lite/core/mir/elimination/elementwise_mul_constant_eliminate_pass.cc b/lite/core/mir/elimination/elementwise_mul_constant_eliminate_pass.cc old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc index 97f6a2657f..8447865bdc 100644 --- a/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc +++ b/lite/core/mir/fusion/elementwise_add_activation_fuse_pass.cc @@ -35,5 +35,7 @@ void ElementwiseAddActivationFusePass::Apply( REGISTER_MIR_PASS(lite_elementwise_add_activation_fuse_pass, paddle::lite::mir::ElementwiseAddActivationFusePass) .BindTargets({TARGET(kAny)}) - .ExcludeTargets({TARGET(kXPU), TARGET(kBM)}) + .ExcludeTargets({TARGET(kXPU)}) + .ExcludeTargets({TARGET(kBM)}) + .ExcludeTargets({TARGET(kX86)}) .BindKernel("fusion_elementwise_add_activation"); diff --git a/lite/core/mir/fusion/fc_fuse_pass.cc b/lite/core/mir/fusion/fc_fuse_pass.cc index 5b8e8563ba..c85d34cbae 100644 --- a/lite/core/mir/fusion/fc_fuse_pass.cc +++ b/lite/core/mir/fusion/fc_fuse_pass.cc @@ -23,8 +23,13 @@ namespace lite { namespace mir { void FcFusePass::Apply(const std::unique_ptr& graph) { - fusion::FcFuser fuser; +#ifdef LITE_WITH_X86 + fusion::FcFuser fuser(true); fuser(graph.get()); +#endif + + fusion::FcFuser fuser2(false); + fuser2(graph.get()); } } // namespace mir @@ -33,5 +38,7 @@ void FcFusePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(lite_fc_fuse_pass, paddle::lite::mir::FcFusePass) .BindTargets({TARGET(kAny)}) - .ExcludeTargets({TARGET(kXPU), TARGET(kBM)}) + .ExcludeTargets({TARGET(kXPU)}) + .ExcludeTargets({TARGET(kBM)}) + .ExcludeTargets({TARGET(kCUDA)}) .BindKernel("fc"); diff --git a/lite/core/mir/fusion/fc_fuse_pass_test.cc b/lite/core/mir/fusion/fc_fuse_pass_test.cc index f7aa4bb5ad..54260732c5 100644 --- a/lite/core/mir/fusion/fc_fuse_pass_test.cc +++ b/lite/core/mir/fusion/fc_fuse_pass_test.cc @@ -88,6 +88,7 @@ USE_LITE_OP(mul); USE_LITE_OP(elementwise_add); USE_LITE_OP(elementwise_sub); USE_LITE_OP(fc); +USE_LITE_OP(relu); USE_LITE_OP(feed); USE_LITE_OP(fetch); USE_LITE_OP(io_copy); diff --git a/lite/core/mir/fusion/fc_fuser.cc b/lite/core/mir/fusion/fc_fuser.cc index 460c0fdf7a..3c99131083 100644 --- a/lite/core/mir/fusion/fc_fuser.cc +++ b/lite/core/mir/fusion/fc_fuser.cc @@ -35,12 +35,23 @@ void FcFuser::BuildPattern() { std::vector mul_inputs{W, x}; std::vector add_inputs{mul_out, b}; mul_inputs >> *mul >> *mul_out; - add_inputs >> *add >> *Out; // Some op specialities. 
mul_out->AsIntermediate(); mul->AsIntermediate(); add->AsIntermediate(); + + if (with_relu_) { + auto* add_out = VarNode("add_out"); + auto* relu = OpNode("relu", "relu"); + std::vector relu_inputs{add_out}; + add_inputs >> *add >> *add_out; + relu_inputs >> *relu >> *Out; + add_out->AsIntermediate(); + relu->AsIntermediate(); + } else { + add_inputs >> *add >> *Out; + } } void FcFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { @@ -71,6 +82,9 @@ cpp::OpDesc FcFuser::GenOpDesc(const key2nodes_t& matched) { op_desc.SetAttr( "in_num_col_dims", matched.at("mul")->stmt()->op_info()->GetAttr("x_num_col_dims")); + if (with_relu_) { + op_desc.SetAttr("activation_type", std::string{"relu"}); + } return op_desc; } diff --git a/lite/core/mir/fusion/fc_fuser.h b/lite/core/mir/fusion/fc_fuser.h index 7ba0752789..6cb08f4157 100644 --- a/lite/core/mir/fusion/fc_fuser.h +++ b/lite/core/mir/fusion/fc_fuser.h @@ -25,11 +25,13 @@ namespace fusion { class FcFuser : public FuseBase { public: + explicit FcFuser(bool with_relu) : with_relu_(with_relu) {} void BuildPattern() override; void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; private: cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; + bool with_relu_; }; } // namespace fusion diff --git a/lite/core/mir/fusion/sequence_pool_concat_fuse_pass.cc b/lite/core/mir/fusion/sequence_pool_concat_fuse_pass.cc old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/sequence_pool_concat_fuse_pass.h b/lite/core/mir/fusion/sequence_pool_concat_fuse_pass.h old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/sequence_pool_concat_fuser.cc b/lite/core/mir/fusion/sequence_pool_concat_fuser.cc old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/sequence_pool_concat_fuser.h b/lite/core/mir/fusion/sequence_pool_concat_fuser.h old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.cc b/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.cc old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.h b/lite/core/mir/fusion/var_conv_2d_activation_fuse_pass.h old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/var_conv_2d_activation_fuser.cc b/lite/core/mir/fusion/var_conv_2d_activation_fuser.cc old mode 100755 new mode 100644 diff --git a/lite/core/mir/fusion/var_conv_2d_activation_fuser.h b/lite/core/mir/fusion/var_conv_2d_activation_fuser.h old mode 100755 new mode 100644 diff --git a/lite/core/mir/generate_program_pass.cc b/lite/core/mir/generate_program_pass.cc index 9ad69b8152..76c97d2da6 100644 --- a/lite/core/mir/generate_program_pass.cc +++ b/lite/core/mir/generate_program_pass.cc @@ -29,7 +29,6 @@ void GenerateProgramPass::Apply(const std::unique_ptr& graph) { if (item->IsStmt()) { auto& stmt = item->AsStmt(); VLOG(4) << stmt; - LOG(INFO) << stmt; insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); } } diff --git a/lite/core/mir/subgraph/CMakeLists.txt b/lite/core/mir/subgraph/CMakeLists.txt index 1ac4ab346f..f8aa09676c 100644 --- a/lite/core/mir/subgraph/CMakeLists.txt +++ b/lite/core/mir/subgraph/CMakeLists.txt @@ -4,7 +4,7 @@ lite_cc_library(subgraph_detector lite_cc_library(subgraph_pass SRCS subgraph_pass.cc DEPS mir_pass types context ${mir_fusers} subgraph_detector) -if (WITH_TESTING) +if (WITH_TESTING AND NOT LITE_WITH_CUDA) lite_cc_test(test_subgraph_detector SRCS subgraph_detector_test.cc DEPS subgraph_detector mir_passes gflags model_parser cxx_api diff 
--git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc old mode 100755 new mode 100644 index bf04d5c2ef..6d48b053a1 --- a/lite/core/mir/subgraph/subgraph_detector.cc +++ b/lite/core/mir/subgraph/subgraph_detector.cc @@ -94,7 +94,7 @@ std::string SubgraphVisualizer::operator()() { } auto res = dot.Build(); - //std::cout << "subgraphs: " << subgraphs_.size() << "\n" << res << std::endl; + std::cout << "subgraphs: " << subgraphs_.size() << "\n" << res << std::endl; return res; } diff --git a/lite/core/mir/subgraph/subgraph_detector.h b/lite/core/mir/subgraph/subgraph_detector.h old mode 100755 new mode 100644 diff --git a/lite/core/mir/subgraph/subgraph_detector_test.cc b/lite/core/mir/subgraph/subgraph_detector_test.cc old mode 100755 new mode 100644 diff --git a/lite/core/mir/subgraph/subgraph_pass.cc b/lite/core/mir/subgraph/subgraph_pass.cc old mode 100755 new mode 100644 index af5bcdee08..116b361681 --- a/lite/core/mir/subgraph/subgraph_pass.cc +++ b/lite/core/mir/subgraph/subgraph_pass.cc @@ -27,7 +27,7 @@ namespace mir { void NPUSubgraphPass::Apply(const std::unique_ptr& graph) { std::unordered_set supported_lists; -#define USE_SUBGRAPH_BRIDGE(dev_type, op_type) supported_lists.insert(#op_type); +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); #include "lite/kernels/npu/bridges/paddle_use_bridges.h" #undef USE_SUBGRAPH_BRIDGE auto teller = [&](Node* node) { @@ -41,7 +41,7 @@ void NPUSubgraphPass::Apply(const std::unique_ptr& graph) { void XPUSubgraphPass::Apply(const std::unique_ptr& graph) { std::unordered_set supported_lists; -#define USE_SUBGRAPH_BRIDGE(dev_type, op_type) supported_lists.insert(#op_type); +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); #include "lite/kernels/xpu/bridges/paddle_use_bridges.h" #undef USE_SUBGRAPH_BRIDGE auto teller = [&](Node* node) { @@ -55,7 +55,7 @@ void XPUSubgraphPass::Apply(const std::unique_ptr& graph) { void BMSubgraphPass::Apply(const std::unique_ptr& graph) { std::unordered_set supported_lists; -#define USE_SUBGRAPH_BRIDGE(dev_type, op_type) supported_lists.insert(#op_type); +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); #include "lite/kernels/bm/bridges/paddle_use_bridges.h" #undef USE_SUBGRAPH_BRIDGE auto teller = [&](Node* node) { diff --git a/lite/core/mir/subgraph/subgraph_pass.h b/lite/core/mir/subgraph/subgraph_pass.h old mode 100755 new mode 100644 diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc old mode 100755 new mode 100644 index 0d5fc7bf5e..a56c364f97 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -92,7 +92,7 @@ void FillInputTensors( #define FILL_TENSOR_WITH_TYPE(type) \ auto input_tensor_data = input_tensor->mutable_data(); \ for (int j = 0; j < input_tensor_size; j++) { \ - input_tensor_data[i] = static_cast(value); \ + input_tensor_data[j] = static_cast(value); \ } for (int i = 0; i < input_tensor_shape.size(); i++) { auto input_tensor = predictor->GetInput(i); diff --git a/lite/core/profile/profiler.cc b/lite/core/profile/profiler.cc index 78317f78ac..f4d0e3c0af 100644 --- a/lite/core/profile/profiler.cc +++ b/lite/core/profile/profiler.cc @@ -28,36 +28,55 @@ auto op_comp = [](const OpCharacter& c1, const OpCharacter& c2) { }; } -int Profiler::NewTimer(const OpCharacter& ch) { - StatisUnit unit; - unit.character = ch; +std::map TypeStr{ + {Type::kUnk, "Unknown"}, + 
{Type::kCreate, "Create"}, + {Type::kDispatch, "Dispatch"}, +}; + +StatisUnit::StatisUnit(const OpCharacter& ch) : character(ch) { + create_t.reset(new DeviceTimer()); if (ch.target == TargetType::kCUDA) { #ifdef LITE_WITH_CUDA - unit.timer.reset(new DeviceTimer()); + dispatch_t.reset(new DeviceTimer()); #else LOG(ERROR) << "The timer type specified as cuda is uninitialized, so the " "default x86 timer is used instead."; #endif } else { - unit.timer.reset(new DeviceTimer()); + dispatch_t.reset(new DeviceTimer()); } +} + +lite::profile::Timer* StatisUnit::Timer(Type type) { + if (type == Type::kCreate) { + return create_t.get(); + } else if (type == Type::kDispatch) { + return dispatch_t.get(); + } + LOG(FATAL) << "Timer cannot be returned for unknown platforms."; + return nullptr; +} + +int Profiler::NewTimer(const OpCharacter& ch) { + StatisUnit unit(ch); units_.push_back(std::move(unit)); return units_.size() - 1; } -void Profiler::StartTiming(const int index, KernelContext* ctx) { +void Profiler::StartTiming(Type type, const int index, KernelContext* ctx) { CHECK_LT(index, units_.size()) << "The timer index in the profiler is out of range."; - units_[index].timer->Start(ctx); + units_[index].Timer(type)->Start(ctx); } -float Profiler::StopTiming(const int index, KernelContext* ctx) { +float Profiler::StopTiming(Type type, const int index, KernelContext* ctx) { CHECK_LT(index, units_.size()) << "The timer index in the profiler is out of range."; - return units_[index].timer->Stop(ctx); + return units_[index].Timer(type)->Stop(ctx); } -std::string Profiler::Summary(bool concise, size_t w) { +std::string Profiler::Summary(Type type, bool concise, size_t w) { using std::setw; using std::left; using std::fixed; @@ -65,12 +84,14 @@ std::string Profiler::Summary(bool concise, size_t w) { std::string title; // Title. 
if (concise) { - ss << "Timing cycle = " << units_.front().timer->LapTimes().Size() + ss << "Timing cycle = " << units_.front().Timer(type)->LapTimes().Size() << std::endl; - ss << "===== Concise Profiler Summary: " << name_ << ", Exclude " << w + ss << "===== Concise " << TypeStr.find(type)->second + << " Profiler Summary: " << name_ << ", Exclude " << w << " warm-ups =====" << std::endl; } else { - ss << "===== Detailed Profiler Summary: " << name_ << ", Exclude " << w + ss << "===== Detailed " << TypeStr.find(type)->second + << " Profiler Summary: " << name_ << ", Exclude " << w << " warm-ups =====" << std::endl; } ss << setw(25) << left << "Operator Type" @@ -84,16 +105,16 @@ std::string Profiler::Summary(bool concise, size_t w) { if (concise) { std::map summary(op_comp); for (auto& unit : units_) { - auto ch = summary.find(unit.character); + auto ch = summary.find(unit.Character()); if (ch != summary.end()) { - ch->second.avg += unit.timer->LapTimes().Avg(w); - ch->second.min += unit.timer->LapTimes().Min(w); - ch->second.max += unit.timer->LapTimes().Max(w); + ch->second.avg += unit.Timer(type)->LapTimes().Avg(w); + ch->second.min += unit.Timer(type)->LapTimes().Min(w); + ch->second.max += unit.Timer(type)->LapTimes().Max(w); } else { - TimeInfo info({unit.timer->LapTimes().Avg(w), - unit.timer->LapTimes().Min(w), - unit.timer->LapTimes().Max(w)}); - summary.insert({unit.character, info}); + TimeInfo info({unit.Timer(type)->LapTimes().Avg(w), + unit.Timer(type)->LapTimes().Min(w), + unit.Timer(type)->LapTimes().Max(w)}); + summary.insert({unit.Character(), info}); } } for (const auto& item : summary) { @@ -109,14 +130,15 @@ std::string Profiler::Summary(bool concise, size_t w) { } } else { for (auto& unit : units_) { + const auto& times = unit.Timer(type)->LapTimes(); // clang-format off - ss << setw(25) << left << fixed << unit.character.op_type \ - << " " << setw(40) << left << fixed << unit.character.kernel_name \ - << " " << setw(12) << left << fixed << unit.character.remark \ - << " " << setw(12) << left << fixed << unit.timer->LapTimes().Avg(w) \ - << " " << setw(12) << left << fixed << unit.timer->LapTimes().Min(w) \ - << " " << setw(12) << left << fixed << unit.timer->LapTimes().Max(w) \ - << " " << setw(12) << left << fixed << unit.timer->LapTimes().Last(w) \ + ss << setw(25) << left << fixed << unit.Character().op_type \ + << " " << setw(40) << left << fixed << unit.Character().kernel_name \ + << " " << setw(12) << left << fixed << unit.Character().remark \ + << " " << setw(12) << left << fixed << times.Avg(w) \ + << " " << setw(12) << left << fixed << times.Min(w) \ + << " " << setw(12) << left << fixed << times.Max(w) \ + << " " << setw(12) << left << fixed << times.Last(w) \ << std::endl; // clang-format on } diff --git a/lite/core/profile/profiler.h b/lite/core/profile/profiler.h index 4e9e9ae31c..3933e5ba01 100644 --- a/lite/core/profile/profiler.h +++ b/lite/core/profile/profiler.h @@ -13,6 +13,7 @@ // limitations under the License. 
#pragma once +#include #include #include #include @@ -22,6 +23,14 @@ namespace paddle { namespace lite { namespace profile { +enum class Type { + kUnk = 0, + kCreate, + kDispatch, +}; + +extern std::map TypeStr; + struct TimeInfo { float avg; float min; @@ -35,8 +44,15 @@ struct OpCharacter { std::string remark{std::string("N/A")}; }; -struct StatisUnit { - std::unique_ptr timer; +class StatisUnit final { + public: + explicit StatisUnit(const OpCharacter& ch); + lite::profile::Timer* Timer(Type type); + const OpCharacter& Character() const { return character; } + + protected: + std::unique_ptr create_t; + std::unique_ptr dispatch_t; OpCharacter character; }; @@ -45,9 +61,9 @@ class Profiler final { Profiler() = default; explicit Profiler(const std::string& name) : name_(name) {} int NewTimer(const OpCharacter& ch); - void StartTiming(const int index, KernelContext* ctx); - float StopTiming(const int index, KernelContext* ctx); - std::string Summary(bool concise = true, size_t warm_up = 10); + void StartTiming(Type type, const int index, KernelContext* ctx); + float StopTiming(Type type, const int index, KernelContext* ctx); + std::string Summary(Type type, bool concise = true, size_t warm_up = 10); private: std::string name_{std::string("N/A")}; diff --git a/lite/core/profile/test_timer.cc b/lite/core/profile/test_timer.cc index 6f49698ef4..3841f01518 100644 --- a/lite/core/profile/test_timer.cc +++ b/lite/core/profile/test_timer.cc @@ -69,10 +69,10 @@ TEST(profiler, real_latency) { ch.op_type = "operator/1"; ch.kernel_name = "kernel/1"; int idx = profiler.NewTimer(ch); - profiler.StartTiming(idx, &ctx); + profiler.StartTiming(Type::kDispatch, idx, &ctx); std::this_thread::sleep_for(std::chrono::milliseconds(10)); - profiler.StopTiming(idx, &ctx); - std::cout << profiler.Summary(); + profiler.StopTiming(Type::kDispatch, idx, &ctx); + std::cout << profiler.Summary(Type::kDispatch); } #endif diff --git a/lite/core/program.cc b/lite/core/program.cc index 8dc8fb0ddd..41d178f015 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -137,8 +137,7 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { void RuntimeProgram::Run() { for (auto& inst : instructions_) { - std::string op_type = inst.op()->op_info()->Type(); - if (op_type == "feed" || op_type == "fetch") continue; + if (inst.is_feed_fetch_op()) continue; inst.Run(); #ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE @@ -147,7 +146,7 @@ void RuntimeProgram::Run() { #endif // LITE_WITH_PROFILE } #ifdef LITE_WITH_PROFILE - LOG(INFO) << "\n" << profiler_.Summary(false, 0); + LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0); #endif // LITE_WITH_PROFILE } @@ -252,8 +251,16 @@ void Program::PrepareWorkspace(const cpp::ProgramDesc& prog) { } void Instruction::Run() { +#ifdef LITE_WITH_PROFILE + CHECK(profiler_) << "Profiler pointer of kernel can not be nullptr. 
" + "When LITE_WITH_PROFILE is defined, please set a " + "Profiler for Instruction."; + profiler_->StartTiming( + profile::Type::kCreate, profile_id_, kernel_->mutable_context()); +#endif CHECK(op_) << "op null"; CHECK(kernel_) << "kernel null"; + if (first_epoch_) { first_epoch_ = false; CHECK(op_->CheckShape()); @@ -263,10 +270,7 @@ void Instruction::Run() { return; } - // VLOG(4) << "kernel launch"; op_->InferShape(); - // VLOG(4) << ">> Running kernel: " << op_->op_info()->Repr() << " on Target " - // << TargetToStr(kernel_->target()); kernel_->Launch(); has_run_ = true; } diff --git a/lite/core/program.h b/lite/core/program.h index 291252619b..c845a17c52 100644 --- a/lite/core/program.h +++ b/lite/core/program.h @@ -90,7 +90,12 @@ struct Program { struct Instruction { Instruction(const std::shared_ptr& op, std::unique_ptr&& kernel) - : op_(op), kernel_(std::move(kernel)) {} + : op_(op), kernel_(std::move(kernel)) { + std::string op_type = op->Type(); + if (op_type == "feed" || op_type == "fetch") { + is_feed_fetch_op_ = true; + } + } // Run the instruction. void Run(); @@ -101,6 +106,8 @@ struct Instruction { const KernelBase* kernel() const { return kernel_.get(); } KernelBase* mutable_kernel() { return kernel_.get(); } + bool is_feed_fetch_op() const { return is_feed_fetch_op_; } + #ifdef LITE_WITH_PROFILE void set_profiler(profile::Profiler* profiler) { profiler_ = profiler; @@ -118,6 +125,7 @@ struct Instruction { private: std::shared_ptr op_; std::unique_ptr kernel_; + bool is_feed_fetch_op_{false}; bool first_epoch_{true}; bool has_run_{false}; @@ -143,7 +151,8 @@ class LITE_API RuntimeProgram { } ~RuntimeProgram() { #ifdef LITE_WITH_PROFILE - LOG(INFO) << "\n" << profiler_.Summary(); + LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kCreate); + LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch); #endif // LITE_WITH_PROFILE } diff --git a/lite/core/tensor.h b/lite/core/tensor.h index de08aa82f3..41a2d16f75 100644 --- a/lite/core/tensor.h +++ b/lite/core/tensor.h @@ -139,6 +139,22 @@ class TensorLite { // For other devices, T and R may be the same type. template R *mutable_data() { + auto type_id = typeid(T).hash_code(); + if (type_id == typeid(bool).hash_code()) { // NOLINT + precision_ = PrecisionType::kBool; + } else if (type_id == typeid(float).hash_code()) { // NOLINT + precision_ = PrecisionType::kFloat; + } else if (type_id == typeid(int8_t).hash_code()) { + precision_ = PrecisionType::kInt8; + } else if (type_id == typeid(int16_t).hash_code()) { + precision_ = PrecisionType::kInt16; + } else if (type_id == typeid(int32_t).hash_code()) { + precision_ = PrecisionType::kInt32; + } else if (type_id == typeid(int64_t).hash_code()) { + precision_ = PrecisionType::kInt64; + } else { + precision_ = PrecisionType::kUnk; + } memory_size_ = dims_.production() * sizeof(T); buffer_->ResetLazy(target_, memory_size_); return reinterpret_cast(static_cast(buffer_->data()) + @@ -163,10 +179,7 @@ class TensorLite { template R *mutable_data(TargetType target) { target_ = target; - memory_size_ = dims_.production() * sizeof(T); - buffer_->ResetLazy(target, memory_size()); - return reinterpret_cast(static_cast(buffer_->data()) + - offset_); + return mutable_data(); } void *mutable_data(size_t memory_size); void *mutable_data(TargetType target, size_t memory_size); diff --git a/lite/demo/cxx/README.md b/lite/demo/cxx/README.md index 5e0ec49add..3217a7ed49 100644 --- a/lite/demo/cxx/README.md +++ b/lite/demo/cxx/README.md @@ -1,91 +1,111 @@ # C++ Demo -1. 
使用`lite/tools/Dockerfile.mobile`生成docker镜像 -2. 运行并进入docker镜像环境,执行`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/v2.1.0/inference_lite_lib.android.armv8.tar.gz `下载所需demo环境。(armv7 demo可使用命令`wget http://paddle-inference-dist.bj.bcebos.com/lite_release/v2.1.0/inference_lite_lib.android.armv7.tar.gz` 进行下载)。 -3. 解压下载文件`tar zxvf inference_lite_lib.android.armv8.tar.gz ` -4. 执行以下命令准备模拟器环境 -```shell -# armv8 -adb kill-server -adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done -echo n | avdmanager create avd -f -n paddle-armv8 -k "system-images;android-24;google_apis;arm64-v8a" -echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv8 -noaudio -no-window -gpu off -port 5554 & -sleep 1m -``` -```shell -# armv7 -adb kill-server -adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done -echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a" -echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -port 5554 & -sleep 1m -``` -5. 准备模型、编译并运行完整api的demo +1. 环境准备 + - 保证Android NDK在/opt目录下 + - 一台armv7或armv8架构的安卓手机 +2. 编译并运行全量api的demo(注:当编译模式为tiny_pubish时将不存在该demo) ```shell cd inference_lite_lib.android.armv8/demo/cxx/mobile_full wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz tar zxvf mobilenet_v1.tar.gz make -adb -s emulator-5554 push mobilenet_v1 /data/local/tmp/ -adb -s emulator-5554 push mobilenetv1_full_api /data/local/tmp/ -adb -s emulator-5554 push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ -adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_full_api -adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +adb push mobilenet_v1 /data/local/tmp/ +adb push mobilenetv1_full_api /data/local/tmp/ +adb push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ +adb shell chmod +x /data/local/tmp/mobilenetv1_full_api +adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/mobilenetv1_full_api --model_dir=/data/local/tmp/mobilenet_v1 --optimized_model_dir=/data/local/tmp/mobilenet_v1.opt" ``` 运行成功将在控制台输出预测结果的前10个类别的预测概率 -6. 编译并运行轻量级api的demo +3. 编译并运行轻量级api的demo ```shell cd ../mobile_light make -adb -s emulator-5554 push mobilenetv1_light_api /data/local/tmp/ -adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ -adb -s emulator-5554 shell chmod +x /data/local/tmp/mobilenetv1_light_api -adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +adb push mobilenetv1_light_api /data/local/tmp/ +adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb shell chmod +x /data/local/tmp/mobilenetv1_light_api +adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && /data/local/tmp/mobilenetv1_light_api /data/local/tmp/mobilenet_v1.opt" ``` +运行成功将在控制台输出预测结果的前10个类别的预测概率 -7. 编译并运行目标检测的demo +4. 
编译并运行ssd目标检测的demo ```shell -cd ../mobile_detection +cd ../ssd_detection wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-ssd.tar.gz tar zxvf mobilenetv1-ssd.tar.gz make -adb -s emulator-5554 push mobile_detection /data/local/tmp/ -adb -s emulator-5554 push test.jpg /data/local/tmp/ -adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ -adb -s emulator-5554 shell chmod +x /data/local/tmp/mobile_detection -adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && -/data/local/tmp/mobile_detection /data/local/tmp/mobilenetv1-ssd /data/local/tmp/test.jpg" -adb -s emulator-5554 pull /data/local/tmp/test_detection_result.jpg ./ +adb push ssd_detection /data/local/tmp/ +adb push test.jpg /data/local/tmp/ +adb push mobilenetv1-ssd /data/local/tmp +adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb shell chmod +x /data/local/tmp/ssd_detection +adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/ssd_detection /data/local/tmp/mobilenetv1-ssd /data/local/tmp/test.jpg" +adb pull /data/local/tmp/test_ssd_detection_result.jpg ./ ``` -运行成功将在mobile_detection目录下看到生成的目标检测结果图像: test_detection_result.jpg +运行成功将在ssd_detection目录下看到生成的目标检测结果图像: test_ssd_detection_result.jpg -8. 编译并运行物体分类的demo +5. 编译并运行yolov3目标检测的demo +```shell +cd ../yolov3_detection +wget https://paddle-inference-dist.bj.bcebos.com/mobilenetv1-yolov3.tar.gz +tar zxvf mobilenetv1-yolov3.tar.gz +make +adb push yolov3_detection /data/local/tmp/ +adb push test.jpg /data/local/tmp/ +adb push mobilenetv1-yolov3 /data/local/tmp +adb push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ +adb shell chmod +x /data/local/tmp/yolov3_detection +adb shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/yolov3_detection /data/local/tmp/mobilenetv1-yolov3 /data/local/tmp/test.jpg" +adb pull /data/local/tmp/test_yolov3_detection_result.jpg ./ +``` +运行成功将在yolov3_detection目录下看到生成的目标检测结果图像: test_yolov3_detection_result.jpg + +6. 
编译并运行物体分类的demo ```shell cd ../mobile_classify wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz tar zxvf mobilenet_v1.tar.gz +./model_optimize_tool optimize model make + adb -s emulator-5554 push mobile_classify /data/local/tmp/ adb -s emulator-5554 push test.jpg /data/local/tmp/ adb -s emulator-5554 push labels.txt /data/local/tmp/ adb -s emulator-5554 push ../../../cxx/lib/libpaddle_light_api_shared.so /data/local/tmp/ adb -s emulator-5554 shell chmod +x /data/local/tmp/mobile_classify adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && -/data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt" +/data/local/tmp/mobile_classify /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt" ``` 运行成功将在控制台输出预测结果的前5个类别的预测概率 - 如若想看前10个类别的预测概率,在运行命令输入topk的值即可 eg: ```shell adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && - /data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10" + /data/local/tmp/mobile_classify /data/local/tmp/mobilenetv1opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10" ``` - 如若想看其他模型的分类结果, 在运行命令输入model_dir 及其model的输入大小即可 eg: ```shell adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && - /data/local/tmp/mobile_classify /data/local/tmp/mobilenet_v2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224" + /data/local/tmp/mobile_classify /data/local/tmp/mobilenetv2opt2/ /data/local/tmp/test.jpg /data/local/tmp/labels.txt 10 224 224" ``` +9. 编译含CV预处理库模型单测demo +```shell +cd ../test_cv +wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz +tar zxvf mobilenet_v1.tar.gz +./model_optimize_tool optimize model +make +adb -s emulator-5554 push test_model_cv /data/local/tmp/ +adb -s emulator-5554 push test.jpg /data/local/tmp/ +adb -s emulator-5554 push labels.txt /data/local/tmp/ +adb -s emulator-5554 push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ +adb -s emulator-5554 shell chmod +x /data/local/tmp/test_model_cv +adb -s emulator-5554 shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/test_model_cv /data/local/tmp/mobilenetv1opt2 /data/local/tmp/test.jpg /data/local/tmp/labels.txt" +``` +运行成功将在控制台输出预测结果的前10个类别的预测概率 diff --git a/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv7 b/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv7 old mode 100755 new mode 100644 diff --git a/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv8 b/lite/demo/cxx/makefiles/mobile_classify/Makefile.android.armv8 old mode 100755 new mode 100644 diff --git a/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 b/lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv7 similarity index 90% rename from lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 rename to lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv7 index 784ad73da4..05f1c2e276 100644 --- a/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv7 +++ b/lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv7 @@ -40,11 +40,11 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SY #CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) -mobile_detection: fetch_opencv mobile_detection.o - $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_detection.o -o mobile_detection 
$(CXX_LIBS) $(LDFLAGS) +ssd_detection: fetch_opencv ssd_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) ssd_detection.o -o ssd_detection $(CXX_LIBS) $(LDFLAGS) -mobile_detection.o: mobile_detection.cc - $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_detection.o -c mobile_detection.cc +ssd_detection.o: ssd_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o ssd_detection.o -c ssd_detection.cc fetch_opencv: @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} @@ -57,5 +57,5 @@ fetch_opencv: .PHONY: clean clean: - rm -f mobile_detection.o - rm -f mobile_detection + rm -f ssd_detection.o + rm -f ssd_detection diff --git a/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 b/lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv8 similarity index 89% rename from lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 rename to lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv8 index 2304b38eff..77ff07df95 100644 --- a/lite/demo/cxx/makefiles/mobile_detection/Makefile.android.armv8 +++ b/lite/demo/cxx/makefiles/ssd_detection/Makefile.android.armv8 @@ -40,11 +40,11 @@ CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SY #CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) -mobile_detection: fetch_opencv mobile_detection.o - $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) mobile_detection.o -o mobile_detection $(CXX_LIBS) $(LDFLAGS) +ssd_detection: fetch_opencv ssd_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) ssd_detection.o -o ssd_detection $(CXX_LIBS) $(LDFLAGS) -mobile_detection.o: mobile_detection.cc - $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o mobile_detection.o -c mobile_detection.cc +ssd_detection.o: ssd_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o ssd_detection.o -c ssd_detection.cc fetch_opencv: @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} @@ -57,5 +57,5 @@ fetch_opencv: .PHONY: clean clean: - rm -f mobile_detection.o - rm -f mobile_detection + rm -f ssd_detection.o + rm -f ssd_detection diff --git a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 new file mode 100644 index 0000000000..d659a316cd --- /dev/null +++ b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv7 @@ -0,0 +1,71 @@ +ARM_ABI = arm7 +LITE_WITH_CV = ON +export ARM_ABI +export LITE_WITH_CV + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \ + 
../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +test_model_cv: fetch_opencv test_model_cv.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) + +test_model_cv.o: test_model_cv.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc + +test_img_prepross: fetch_opencv test_img_prepross.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_prepross.o -o test_img_prepross $(CXX_LIBS) $(LDFLAGS) + +test_img_prepross.o: test_img_prepross.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_prepross.o -c test_img_prepross.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f test_model_cv.o + rm -f test_model_cv + rm -f test_img_prepross.o + rm -f test_img_prepross diff --git a/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 new file mode 100644 index 0000000000..c80b07d5c0 --- /dev/null +++ b/lite/demo/cxx/makefiles/test_cv/Makefile.android.armv8 @@ -0,0 +1,70 @@ +ARM_ABI = arm8 +LITE_WITH_CV = ON +export ARM_ABI +export LITE_WITH_CV + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = 
-I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_full_api_shared $(SYSTEM_LIBS) +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = ${OPENCV_LIBS} $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +test_model_cv: fetch_opencv test_model_cv.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) + +test_model_cv.o: test_model_cv.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc + +test_img_prepross: fetch_opencv test_img_prepross.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_prepross.o -o test_img_prepross $(CXX_LIBS) $(LDFLAGS) + +test_img_prepross.o: test_img_prepross.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_prepross.o -c test_img_prepross.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f test_model_cv.o + rm -f test_model_cv + rm -f test_img_prepross.o + rm -f test_img_prepross diff --git a/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv7 b/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv7 new file mode 100644 index 0000000000..b584f56235 --- /dev/null +++ b/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv7 @@ -0,0 +1,61 @@ +ARM_ABI = arm7 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/armeabi-v7a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/armeabi-v7a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include 
+ +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +yolov3_detection: fetch_opencv yolov3_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_detection.o -o yolov3_detection $(CXX_LIBS) $(LDFLAGS) + +yolov3_detection.o: yolov3_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_detection.o -c yolov3_detection.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f yolov3_detection.o + rm -f yolov3_detection diff --git a/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv8 b/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv8 new file mode 100644 index 0000000000..2777981701 --- /dev/null +++ b/lite/demo/cxx/makefiles/yolov3_detection/Makefile.android.armv8 @@ -0,0 +1,61 @@ +ARM_ABI = arm8 +export ARM_ABI + +include ../Makefile.def + +LITE_ROOT=../../../ + +THIRD_PARTY_DIR=${LITE_ROOT}/third_party + +OPENCV_VERSION=opencv4.1.0 + +OPENCV_LIBS = ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgcodecs.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_imgproc.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/libs/libopencv_core.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtegra_hal.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibwebp.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibpng.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibjasper.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/liblibtiff.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libIlmImf.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libtbb.a \ + ../../../third_party/${OPENCV_VERSION}/arm64-v8a/3rdparty/libs/libcpufeatures.a + +OPENCV_INCLUDE = -I../../../third_party/${OPENCV_VERSION}/arm64-v8a/include + +CXX_INCLUDES = $(INCLUDES) ${OPENCV_INCLUDE} -I$(LITE_ROOT)/cxx/include + +CXX_LIBS = ${OPENCV_LIBS} -L$(LITE_ROOT)/cxx/lib/ -lpaddle_light_api_shared $(SYSTEM_LIBS) + +############################################################### +# How to use one of static libaray: # +# `libpaddle_api_full_bundled.a` # +# `libpaddle_api_light_bundled.a` # +############################################################### +# Note: default use lite's shared library. # +############################################################### +# 1. Comment above line using `libpaddle_light_api_shared.so` +# 2. 
Undo comment below line using `libpaddle_api_light_bundled.a` + +#CXX_LIBS = $(LITE_ROOT)/cxx/lib/libpaddle_api_light_bundled.a $(SYSTEM_LIBS) + +yolov3_detection: fetch_opencv yolov3_detection.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) yolov3_detection.o -o yolov3_detection $(CXX_LIBS) $(LDFLAGS) + +yolov3_detection.o: yolov3_detection.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o yolov3_detection.o -c yolov3_detection.cc + +fetch_opencv: + @ test -d ${THIRD_PARTY_DIR} || mkdir ${THIRD_PARTY_DIR} + @ test -e ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz || \ + (echo "fetch opencv libs" && \ + wget -P ${THIRD_PARTY_DIR} https://paddle-inference-dist.bj.bcebos.com/${OPENCV_VERSION}.tar.gz) + @ test -d ${THIRD_PARTY_DIR}/${OPENCV_VERSION} || \ + tar -zxvf ${THIRD_PARTY_DIR}/${OPENCV_VERSION}.tar.gz -C ${THIRD_PARTY_DIR} + + +.PHONY: clean +clean: + rm -f yolov3_detection.o + rm -f yolov3_detection diff --git a/lite/demo/cxx/mobile_classify/mobile_classify.cc b/lite/demo/cxx/mobile_classify/mobile_classify.cc old mode 100755 new mode 100644 index c651bf9f4c..d0cf59e185 --- a/lite/demo/cxx/mobile_classify/mobile_classify.cc +++ b/lite/demo/cxx/mobile_classify/mobile_classify.cc @@ -117,7 +117,7 @@ void pre_process(const cv::Mat& img, float* means, float* scales) { cv::Mat rgb_img; - // cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); + cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f); cv::Mat imgf; rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f); diff --git a/lite/demo/cxx/mobile_detection/test.jpg b/lite/demo/cxx/mobile_detection/test.jpg deleted file mode 100644 index 6bb36e136deec6088c7b75215fc35d6231283673..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 127499 zcmb4qXHZkm7j}S95_*?vfKWmcklv(2LTCv!6sbaJQUVHC=mJth=pB+kLT?Hv3P_EG z5Q7#kbQs#?I!4Q+Icj1B%b2m_d%or{x8oQFr;KtWo;;Qzb)8v*dL0bemSGcia5 z7Hgos2w-3WGPAIwnXYNcU$9wjkk&)@F`oB61jJ#6nOnj#NKqmod4ZmnJ=6|aI zj{j-!GVuZ~0p3#z(i33GJvn8Yk_g9c9H^0Fe6;Gno#vAzNkt~sH@bSuA3n7tqclBO z2(GtJ8|R%<51F%PKg7u>4Hs|$ZFtp-ij3=T5M&fG9TOeC)Se=V?^FUyPb44Z`1!P5 z+^UOTnEVNse{pb0%OiE)QH2du{?%RN3B2I3rI8NE1vV|~?w4AkHEEbNR8xRgrLdY6 zb>1anE=Sn(QC^`Zf;Ffm{nl}oa5YzTQYp7tW0-{OYcR)=(pRdUE(s^D)9Iqf%Mmli~KN|J)4$E&!jtb27yQW!mG9*l!SJH4LI;p9L>M+lOE@x>L?cj{f%RZNm^q@3u*H$#fUgKR!UkB%53Q(mO2dIhQwz^4e&*$@W47RN zpgu-uC(7DOb)Hh+uTVY)FS#Y8NX&Ea5%8$=VwEbw#+XeqftN8`(+rZP>%fS2agps!TSp|ByY$xTMTSH+3!Hp`Iy_%SDFrj z)89TRPVT>yp`QMU{@Y2|S|-zJzP$Vt@>k$);gD5`d4(dm$;v%E-*GC4Ene^rT1+Jg zHayQFC04-!M@rb~MkiB^N-?WLj(g5N&zeDXF3D7vkgv-9KF_HAupP!h#H17O913L! zQ$k&Y$Ti#RIu{Tv3wcP=JiH+DdMOYa-GT~@a->M}6v_hh;L;{$UaYV*vV!dt8#@hf zw*eGgO|CG@n}7Uku(fmH^ky&I~rq27cW*f#S)2IP0?2PV7?|f zxYawb^yYPB@>Vk8OOyg=h=H%zVFEXvf@7Y$5|w)Tc^Ka|XP#8DuGzoXq%?hM?V_)^ zNyqZXy+VDzk5sU&V^N1mpudk5+prc69L~)l+l?Q*^z5W4Yx8w^d%Sq~N(J~OD#NP? 
zCTv5?cQF^pMmX9Wln?QE!UthLpe>`SeaO}e#CPy|3?m~*918XX^fh-0E4+$_J6r99|1J5mN)1(uHZzGz)~>s| z6oO*)ZyaGS&69g>5GAnM((b0n+E-^8G3Iqczy-rR-_;7ToJV5#Dn{&N8FYm#mUZ95 zJgjzKH8LV%xPyWS)`7R)OB!c~A2+c$nQnWYFm)mKJ^h`!NmuS}x)Wy}%4McysK_$} zW|p2C0m1+b=3h*BkdT|<-m*6R57|J&yFxBTdxp(eRc$y*HEF~w;=gnb^1JuVK_Xp} zl!#jmX98F30u&aqM^^4r*t1N($!BEKZk3+JwB$n~l^3?nFN|cl?d~OS^2vP=RM?Z_ zP+IJ(H<5C%&*Fh7A&C$ZHyNeM3VO0ygJygML#QZ9$|{8aH<9|ydPo$?V(PRH>!%C5 z-h7hL03w>P?DD1oe!L@78630lg2l{ zw%&m;71ei%CpEur;Bq+YFU0f^wZ2tM@bkp%)|dalqpFo@R#p$)bf zbr_znVDptfM&<;|p%MO(gr;OvI>EyRm?thqGFr;)jpfEM**VrdVGv7Re9c5ylCP+UG=i8~u zAlbS~+MlW%gQyyL{JI%y+1U`Lx!rK?Pj$CEx>F#b&0-o5KL_h>m@<5SH1i2O_ zSj-J*4@y_Dw>8<>eo>NN)awFM38(d}Myu!ntn%xpCUfVc`+V8Nlpuk^y!`5NXKU5y zZ86Nd*(H9Ffh;$Ly%eSrtv1&nK3~DeJVt-f_Jsvr;9aFf+;j1p0;`V%TlQq{yTP!k zZR&X;Ur|@XtaED7F-D3EUF~RJ5guyN4{e3D%hzD!^*+h~J9B87w_|4agX|6EAqVXv z?@rTs^p(8+>bVa?S6^Xs*El*#j@RXSM=ppl9Q47Csdh}Uq!6>9UX|ASmZp!VxhvI4 z^{l__e!~2qGs1_*0e{H93m?FnLk&l?qeC}#0pP@j7S!AUNhD#vO zGH)^SO$+B-Jx?X0K}>nMLtt)~Bu}Y~9KSH|X?jqhG*7tjrmaR541eKT;6#pRRF#4X zM5O?@7G&?K2Vwz+V=o78%NkU&ppXN&MGG|#BsjC^huFzjRYpl9Od-W-8g!CEV~m*g zb_CSl3=we&4F!#{EWdTvDPd|Yrcp!7#K{*v)fV?nynjX-bbE7UH;qWv?h?<{GXX;! z`P0ru&?Lkp7YcY1Jn^r2_#6;Ytk6DVaSr1bP>z( zuf!8bHtW1M9gB4d;Q~tT@9yO(|4{XF=)2?!BX2-~$^rZ5LhyZgtSAFTb|#vlbd3%Y zWE;HSj>t0s2s~@;JJy;m(`In3pmH*bzSR&c{fI4u7C^|W!&c^}VNW~Wzt|glnRV#N zy@dS-*nm>CK6c^V#~3}EoWwK>=O(LuDhUi3gqlX-ZwMk1=V)d9KQWr8X-wBxy^XEv z6^|z!Q&-WisrqGlM00@n*ph@(2bM)*hw94Mt;#eFD3hLkO2wS8%}QDAo8YY(FmbGv z4=~TZw$bM@;nDc(&Uxx}mfOo%*jf|y=!ei=khbQl`Xe1DM$y|mlCIWW-va*u?kxWU zB)2GH73n&D84Eq_5J+KWQ~3`7lf=BnCHEceDV|E9&!M>23vY3+jCy**uwGL)RE&O? zAdCIGuHx#YRUmIBApEQ5pzlQ`5JrQ>6yY)LM3XFLxw)v)(XjyYdcV!k?kBz;&dJEM)< zYj)6SwuqB{Yd#pT5J`-X=eVN%Iv6H-`Z?;RF^W_tYY%Dfa+zxCuqi_LEJTY!M;k2m}KeAb` zRmVJllwfpU%%h&2^xM4JS2j#uZeT z_~obe?mbEv>b2X6DNS=tBdYc`#9dh+D!5)^8~rpebYW=5OSv*Yb(z8YOh zE026e?SHCYF_ufOvGkT+_aWJ9YEeugL^Hvbw z9SZ7(%kA1Gn|L9x@-U+1-u)kPkd^v!6NDrfvgc^GP&{g~8slemT9j^3Aj|&S(KPn; zRYD=L)#Brx!DhtbgJkEG|N1Vc`+ODBkF5Cyai<=1Fy?(TbY>nW9D5@6Nw4dTLxQA( zoWjX`cr;@*V=7-%rIfBI5o@Iq$A*)BlU#+9ef->VQ2Tu*{fli(x}MNRa#@V$1BGkA zU#X1V6G9(@Odm`b9|PC$>st-4OZT=Jl+)=!;$#AY?z$D?nl6`f(0U<91kE4KpJ0Yy zd4XW`lin7=k+o&eJ-I_>{=u@KXfu0+FI+7hUqeKg?|-keqP?On&&Ea9Fgd={d^44s z%I1Ni4&K>3_8ivkx9(%ArGq2d)w4YsTFK}4dnE|@i}%v5Sx8QBwK|?gKn=@vL^4X; zFdv54;o`5XBXkQB+povYtQEgP-)b57tx^E372lNV;*Rs!e0NxJ1a4HpL^xg&%VM2C zXc5+a1Qx9a;o(EWP4%lyVPB(}MKgvRmX;EJAQKcW`#kiR+G%rM_@w5w!H`Ss7oFti z-$BHhjm%C(MC9Fl-QSl4-m8G6wqN!(j=6F_ zj=g+%1FAH1i~9YB7C?Rwg!OqIygym~WYxMrG4sBMW6k3JZ`K_Fmu~@%({*N1y^>y2 zr8AJzMvK>Fx5s@ZC-?5I%Z0xFRmQ^|WQBaYldCWFN)og$w14n{S0&+Ea<|YKW zyfdb@KC~zaE7OA@?;s4O)J_`c)we=wGI@@EFRXitZ(CRzu=@VgL0z$Uf3OoG?#okl z6!*lpTDEB|*3QkH~tp&}1*0eTuk5!bqh^~WI{gi-EC8>CBqcj;UAgj0Ta zTJ(L#b3gZ*#|gQ#e!;nT+mquL0S{;6_tS&9e0{gAU`6+$n zx~DJ{z-wsHAH!!4vI5{G9YAwQLwb&ATE0b}Nsw1Kdk1>UW5<>`=5C+S;A(6G6FT z8Wxq**-g5TbzjMNlYhAG8L%5yxKf6s_d+yx2Ta^r2G~5DFN4<u!KRWT z`};v^9ZA#@ZO!4I)ikTYgWlnPfSlX6T33eM@?v6~9op;>En)mLR+j_RcRe9RSBXXc zwp?mQ8|YcHAVW`0E#9R?wZ(-3T(|W&A+8H%C^ zD<#IN;Cr*(a0i)kHvI<-&7(njuawMv;K7wolf0s;6@`tzn|-jMNx=`a*)mH$+*Opl z#@F;hD0yyhQf8mjOuzaA{aVG3?>j>TSJio1b7Z>aEgq>0A&G5>M4fYm&5_AG zLQl?6q|Aw3$XG#;%~X5}5x|(Y3b-GiHZ>3O{Rhb4m)ak>W^V{F(%Zxt@A9X=+2D<; z>rNcJA5#08`mNkEh`zLtI%YC7vaFUf&V2oSRZbxvzSkD%B_}1DWMBD-%48;e^v31N zK!l{tsNn&6|mc=9waG}Nc zbEQkVReND)>ns59N?1G*SR`fx>&}uHWUu}tK}9$Wr?g_m?i zuAnf{l@H;Be}FChM(v!4Xv0$O=b}ocU*4%){Lc+HRCDp1ovY2xE0@V;XHV522zP2u zM_+OOi%2y<`qSuG^yOxktEbLv(WA1#5~f6l2kNWWorYReljpub(o;8&G_A88;QfB9 z-R^uRJb(Z5Zr9`A~}fn_X?a;lGrt!R>O3$ES?m@cE4k>_&^VQ#Vwz-n+M6 
z?A;2eKQH3=sN+))cXVoYx7i7PUfO0h=Stxw|HMI-1RZ#uLty8w^;CiLLiRJrGi0D# z*a$8=Jc=ae8NIKDI+1!M_Rz{vSkr|%_&h>JJ{j53YPwuwqE%uVP-zk+&$JY6?%;7J zPGXWHtJ|#r72l9D)DWM=9g+3l0eKSt;C$$DQ`>Q7nC9noPy0v~+l2pou3u*1O2}9` zHtxk_#7tdE5|_d98!A5z)i=jg*1gkg`uo}a#jBc*Ka8?Um&oyzE$i=SgC0lrW=|Jx z55_gM`~yU|@EtCE5A8mgm9%wO)h8aGSf6cz_bsoTs$2+BWNM63Ji~qxcsv%+jQQ;k zhvEa_pAwLGmI*Zi*;YX__c-LlBk=0y&QQ&!9a7k&;#=_~(K42yC81W1HN;Z_{L zPsMne0#@e)YL5lMt5-biU#8GVU#%Jj@3{YBo0OW^-C%)azcf<*i{dOrWm%p!yu|0P~qk7 zi$kIa8g&ctxHWo3My=<~JJ^#IpTMe3lTBHJb^peb!blgnyuPP7!)Dn4W>kqQjlQtt zm;H&Hm3l8@$&;t?s#UZ`)V-S@MK1(qB*yw1hhB4Oc@}I{AgH$I%S@y_tIklMAe(S~ zGzncsVq}&M<44;+8d+M^veR^3tbF=Ez$jmzNr<*3Q#y;=A%O69b2IESC?eFf_%}dY z_BY^D|93VM6E?Kc4<22c-KtkuUC>bOU_C^mU*Gj!1;jMtX{bVEXTF>FqEa6!?uXZd zkD7~+>h>rro7Lt1ZV+rQ=juaZzgZQBrj-iibM*?)F0GtVCtb2I(Q&E zPJnk0(3j%S&gS8UrAhn3csMOBIty{b1#2J@H%(Y;OGVR&^N|jZC#+%H7m+V{6Hu(@ zTv9B=%?fLS)J-J05HWiuuua4I*8npg9=^yU+c<&&{i=!GkZRSnoV$s9dnU+GyTUII znT0k=-*b{KY%*xXq{S_)qAHMu&|))qs*U7cj-2Meh>h}Yi|A4%L$YenRj0X& zNV>}l=6d(@uYz`22clwgQ_U8Z-TumDh27%ik&z893Cnfq^f;wHIw&vFW!!&MI*bwR zb}g-TZlU`*;f=lO146a$eaACJi|whGT^MZoi99=%aEn5-+;KwBy2Q zj@4TJQN70Jt=Q{zr0#kHNdXAn7OM=#OBu;Wb6VRrg5F8i%1USZ zRpr-5!qR?@?LMSy2Q6D$(YT$tl5gcBo|a^TT9@E~fv0&3Q5X4fJV~u~L~n5@=P&6n zxY`B9=Mwx>UQT0etM_~uD#(hdTG{w+n8Up7)TYdMyPbURbMDBU=|hjVP^_WL?4EvH z;yBNk=}bnydHpiPCZ*gv@-CU)faJW#1U{^vPwshSn;2#o6aT*D7_e6QvsHPwZ)L$e za>ya}99YM(SUpax9a=QAKVFm{<_s8JB1hX6Y;SLWjX|X@qQ@N{P=U7bh9+E;>IBBE9t1w^r`R{M=vN z`R&e&iZ=?w+5Z5l{*hS%>Is$^os*uL>xN_pP!Y5+yh?`SE|C#PFvaJxDV#BDEAzrA zmX^_U&$+%rfW0_l)(`@PcQx7oO*oBp%Ejar2Z}~J68t6w?8ejzw-pkqD40Dy0m+bZ z-zKY`my(E*j;x^$y26vjX{~$12uVu+hRjKdjK>L4_DeyWXQKb#nGkyABL%z5A}%N7 z*QxGWL7of7AC>hvT8DIgJC?Ao7Aw~=mpM=TQc*E2_3W8!dpD4p6(`~5GvOLLqleOc z*5X~#s9jX>PzB5lai1uSFl=3X%yAt?&0GpoR2#u5UfPN??H*G}kP1cyHrlwZEME1X z&g0wAA3OT0l|=|CsjbVx{kUfzd+}`-+xza8&{$2i5T2(|pq27y&+k+K-JcyHHs?oU%&`w}Jl$6sn*WZ&rTkL7WR`A>6lyzFmLP6eMo zXxW#0V1upj_!2M!h0x2NFJ&%a@{lp_&L*2r zL?EBTjg3GLCh8zv)zpaQe*-=Dir=u|KbTwt7R^Oi-1MG=E3tLCv!`#olKS9>nXRz}ARbP4*@5IceRu66JFS zU}0cO1c3P*z(Y*b^x%qMrKW|uH)PC^&a?a&=say zb1N3mEVaPN9h=O3iFZTsMcZYY#Vp_oKLGYQq+iQY_-U3Zzi*&-uzss;R)paU%|+DC zT3A20n@w4&aRemmwVGE~(ZD={U^fT{=rDMurpI2_Z8uKhQ+AwqDiU9?tEgCvt zb52qLi;fFT$@{R|W?U4#GCOA6!l(3CMz>fIqwlt43~W8Muifw9nPDoK3rVyI3%mzE z5D0ot_qvf3uzPA-ZK^UA9dp00MkTKx)1aiu+G^QEW$=?)=K?*Y$Zu?NJM2bO5h_2U ziE~Qv52r%7$u~5beDNH*FA69tc=cEa%_IfyOOOuFO+TzH>I*8`O2f*!XCK>!&@{DM z59~kF_rUQeE}obIe{D9a460}idnlV!>`jNJ(b&p~)xqRk#l#x5s8j~a)XSegxada< zNhf(++1I>))s~sfc)On%g2zb52t_uNh1H&39c8OnQRN#*B*@<9eYftM#4a9qZlj~% zas#Cqm+4t3BDYaozX;(o6InI^~3w^d96I8)>J_H(Z$p@EQsVhpOB(`F{9S$ z`r3W#P9~J--=kX}&wEWbzwZyxS_;v$TT3{geiMZ*Xk54@VEYu=Eateoqm-7H)g{blhw)hb~we& z@tC}hBElG5!mw$bNwF*XoP}f*3N>N8*n5wkQAV+yeWdd+SC5fDJ2;}#6aMe8al4moJtLP=3q!Vw|}x-1Tn zfG!Z~!yT)EkVeyLX*N~(x(xzB?s@;&qO zA$FmqLXzN)LVczv&e`3e+9DIJ7nY|R-7R#FH%{?P@8YyMUGh>Uq&g*r*tUVy>(G|A{OSJZ>Oa7CUX$eh zgVFQz@WsoT5Sa*wqHyk(;H^gg6n2}*o|JD6TTRS75~aE0wR{!~pOP7s3nk`pLhX?*OhCj5#}d}9jt;a4~;nr@oiq4!7!T5bz zhNp^RccDN#5AnppTUXd*A+vjUIjo8~EX?9=RIfNkiDPF-)^M>2TQv`Ff^%`18Q-2A zZkm+6FENWhuCs`wsOkCk-b1ilM2guS4!g(85 zYo_FOgxNelZCDXp4LOTBE@@X=d{(9N9>YVm3!Xrs_&2xczJ|Ou(zIU5_zxL!{{VvF z<43|y#UQQNVF$19_%@YSH78tS;WnxGsWZNmI(_l__5Ym1uF&jK(-xW0i3DZn(98x`}&0!FE z@($iW#m-2|Jq5}o6|}ck<{TmdWnRVot+LOPb8dmw3MDMP6J){Hlf2gclmXaW zRgdM}%DExPGio6L)*dobbLv@e4D_Oia)X6Rv@XgzD=PF{f7p~>51eCHh$kO9LrYV~-_%)TtU&Iqe-p1b-5OT3kcXTTyqP zio?@pl`bK@ZBb)%;mr`Yq~W7aTO37cwB_z8@|)DITUA@U5(6k#?VDSAhc38$#`FP` z=n5vqkE>G`kKEs|dE5-sH#KY5&waclgcp<~+xUTRU+X>+vwNmE^wmx)w$C*LG@@2) z%9eUBNX_YL3L9F0X>cSR6abTR?jq3Oa>X=M1(X3{GHMP(T){`yIwDZS1q3RYFH3*{ 
zkzfvko7hO;OlfN9G+dsqG4CBjCOVA;ZA3OlXg;YyXs0 z+1)ls@q56UxCm6?1FrOkqa)>7meE)m9Ux8diqb#Nfg}cKp3QOwHNvf=Ho9 zHa)=riPVSst)?Mxmj6mCLKDoMaDtY)4N0&b!WhT<1_DWQj17fG@CA=@)N@$}65~x_ z2u&_;m=_s=W`@ba3b|p3Koa0<4H(CCU>l9B0C?ip#{_m+!#Cj)a~cQWn3d3ClT9Gm z{)h|Na#ztx4X+|=_4L7W4AE~4#g4H`YGy4`PkwOWie}{t3N@12og?u44zR;CpI7PL zcemC{sQpAat-d|AKsHq+B|YqYZ`Tq>IVabG!7EdYT)l{~czt z7WVwH4aF!@kV^N};OWyHO;aWx~{^ChrLBs&DsW>_glQkAN;R)1U<-VpjR|{)uOXa|FuJjtaT(i%@yqb^vjM+`fJQH_|*hZ(#TKuTZ6yTC%c#Qr{Ev$HgC$kUQ8J_M5uV6xu}dzWH`%Fu9JIF3G<@St z_?WgF>@Ag3BZ{*8u`_Z0^YSC@YU6X`E#DKCVw zwB5fMJ!<*zIY|Bf!;YYJ{dVgzO@5tfnvqVgw>jD|X~@;+Gdq?nycU^J`!ZB%31CI{ z2Pq*6FjThK%R_+DS|Ojbm32??^C)P18hd@yr(Y|OWTe959}jPTyKUal+z~v^ba`!P z^7~X{WsS-yEnq9q0sndd7u=P)#=G*YovVTLSH6CqqI<(K-6B@V>S9gqr{$*6SBC@V zM(?LCEZz&Dw`llU=gcO>hk89Jv@6ZrylmIst)g+i1G{>V_jV5*mTuEs654Q6Cn9dm z&rnhS+Ls{he}FMaRA0O?>5N+Y)H8gqMoKXM1Qm6jOa54~bl|Dr2RblP1YFwlN!x|L zSl2T>S{PX?l$$22+_+l02(W&xU=6Y?R$=5a3v?t%^Ptl4$PkhSt{0cmRV@O$P6}iI znsPKU%LqUT#AMvY7#5jK5QtC|sss9tLt<*E&v~GVZ5nMToSAJN0tgVM{yM*?u%BqB z&oJUlhGhF#BN~7z9aWta^7S~K!SomogzS<*{<~Ll{K%3gdq%CxTJ?IPW73Fu(r5H* zmi>sjGiFqMz**g7^)@?Z{wMibzm}-#B+aM+8P-e#_j_1X)t*Fz%xHVvYFcfF z*O1!3x(^-nw=CU$;4vQ0!sw<;$zbgp8cKZBfNJH;Nb%a$*h2KSE!yaKGKWd#e-tz$kSx$1q!xeaO|l|duEC_to5$9 z4l=YVSEXIMEG}xk#G1@ApD|3wepzAyyAC^8_80Nsmy!U!J%~R}_wQ&MxETWp2HRBllnkd&6hrvn zvdKIk=*%1|;=r|9mZ~DgR0sZpk?qT7u%y;p^toVIpCR*n(Sn-I&pL5SU}Rj7$)-d> zf!(=Q8T6k|^FKiFqgki+=3wVGsF>OXS--TM;X8ky4;@tek*(4D!r6+P29BpU2bb9& zYH9-JnMRbkL;DsVkA2f0nzV%#j=s8D#?m)X#h}rBTAKYQI?evjqnTwVa=T+7XxJ@1 zl-Y5NYbOo*qb=vjA9KmOYd7!hN(eV6Nwq|iVa{Z^st;SS13vPO`Fc)QTI~*`qJ~hW zvuxMkJs#R-C`N=rUp*LEXh6|heUo1lEg43fC0;PRr&!0xackRlp3E-i#==&`bm{j=uc%S;yh(z_EgL_u z{D$7J1~S*$1zRUiJv5kW`{TXhvhMlB9I`&f=G-q#4lh;fX*=Z2EKWyaW$jm?6FKyd z1- zhMR%pmucJG+M9H2Y9`!CMqK47V{Kr}31Z{P0*RQ+L#r8Ry6KXsa}hxhA^1H@2+X-K z9GvJc<;De|csg1+b_ng2OZhbZ7<(4*{%))0n+V)-tinr@k2Ip!!Gr6!t?=6j5qM?V zLsN9)JvXEl>KRouFZM|mm(#t+ryd@wIdea4RGeSx=C(}M7i;vcW9*8$ziQOp#cF8< zicxXvsnJUTF&kwQ;kL66Nn z!mr2_A~vfB&mV!rg0O@&dp*>M-FZH3{*cL+-k}R+g}u8P!1j&;l2$9glCWfT$9~@E z}QnQ0Y+jZc`>)mYc zsE>UVl%V8}khsu*2>XX+y;%c!!}ejJwmq0uCv|K2HKXB5FVF zxdn%a)e~-JMao*^URyHgXZxv2Z$i)5Ug5sVC+3%?CO04_?8V0GZ;VT^X}-$ML28^6 zKNUi9V~DX0_Z*iL(hoZAi-TS;GS4VoLbnQ-8nm6Y@cVyoOVQ+)k&)W@HcwcsOGORy)#18qvYm;4#kLty$mx>^2ge}A#e@+^DWZhOm8%M_rh_ziUiXhDh%^M zEwkuXE8hN!gvS))6-_sceR|cF;t)$zs2o!S{*f4>u4@!6 zJH@G(1C2UnTy3TPwwAu|jyZfl^i45Tnk9-q^>CsO34Yhd05J9y+rvF^nqDt!4P7s} zE;-L*ZC4le^9|eq6~;=`R)=UgK~77Rv?yPkFR$WF7mb)2894llex9c{s`c3n%iEsC zkCpLZL$VQCJU8D}9j-#R?f>`^_u@2U3i+;w~XlKTZc z_m5C%&tzm@$g#IiR{u9jk=+hca^FwV5_{iv@~Zz+;@)ZVw?x&s&yt_F-1WGoPrtkS zER|LsPhaBqV0|8}`tiomx7;U`lcjQoaxkX-^%I6*Y_*-|&Gq<4Q^%q=uTgt40{#}? z(nGVm?05F!yFcVk7x*O`pKtG7{@mHm!(Mtao9y)#wTpSq zpVdijdAd?sq_lg^CGJ$h_<{YY%U=Iy8Rko-2S0~M8Vja<3Vz|gtVxk~TlW<%9`;Yl zBzJF_As9?HOmHgS8=mp2a9&8 z$aZewZ6`r)S+Vm`rI~A=s5ckeKHH&(1gTabuY&h$j();U+YcjlV%nT;q{NxuT&lFs zEP|EJw%qvINPqAS)~atND!66rZbLpV#XuYn=D$^FjCBAJr+o&O* z^vS-+BX=B=$sOkQ#*S5)@DbT71E2ajfy^-!#-iG%tfrl0<;%v@ZxLZW>+RQ;yhy=v z0T_EebIZ42xY*<*V>FiV1hfRnhC>TMP`n8gkKn;Ty`-#}!s|mo_-%cTnd0XEOn&$V zu>ILg=h1~r>qX1SUT~g)gzx3RbF{SsaE<(A9S}Z3{SKnvV`{xZ+XeSz%e9rM>!-qS z2d*grS*x09EDLWY>{_A8)^!3V*}~gEB1e#@QAhuC@+&A(DxUHpaxR8C_&!ArP1o2E z*z!KbuklF<$+<@WJIw}=+)FJn5N%(37>yE zM7W-@Fvl}5pD}~exe=}H9mIEY<_t$v~_VjSGjU^ zVhI;oS;0j$cVIwW?bQ#xu-+Se5IHoQGNzVs%+STZQKxR`v$7glVO1!G5tB$=I73bt z=sULj{O-az)=EFR7MD&N;1KlnhHONr=CC0uviqBaK-EXS+B!Iy?Skxw3C|#*`9Q7i zbLl_rx8#Lc4ibLkq~H6GR@ngEw(x~U%gQkRbfp+M`8~8(ZJ*ceiODU63aI*O5XPc{ z(3tvhTdh? 
zZ80h?2;?D=3^JDN=M%j?>dVDly3&tvyC1(Fn4I+fmhJSERg-mlPo7kFN#(h~&H3+c zeNGpNE%ekesf3Hvp~p^wt1BmQqBpHm+YatzMz-Vh-!p`x7MF8SieMv?S7zG2Q1{afYp^9K)hTG#8EFI@Y3!TzF0om$Gexy@RQ zotmMIkMf)Q&IKpl|Cyp&7o+|GqJGm)?(F$Ib?Z1@RgpcVnO*vx`r-8Lp9b7>PGKFS zcT0<9nm^;$)zy=;1Z(Tc+0M(+YB2@TyWVKbpZI%Za>Ea*KB7sXcF%~$h%y2aG>SA* z(Q|sK70^_Sz@D6aMfjc^oGI8r0Y%`v3)<$Op~qOJVA~ud1y%wo6`iJ_4tLNtBQ%*p zV&sN=7;~I7pNtY%zSsRJ?2RKUR<*8PD~R3g?a$|;+KiwLT=qCxHP;VQsdh`HCLyLK z%eb*nxx4Iu(<|6+e&M&kIU|8II}3MIc!JDA)A99n`+}B`x|n}}zo_1ReQ@9G_?Y1+`+9z5 z%5%9oOIy#mW6}M{Aa4Uz6%n@M#so=r41sN%tFlOo!BDkxjXSt%J}%ZW}~{m54bNj6g`L7*#58Cs#w0L5Q7(S*rtny?5I1*U4K zngv?K5FkL10}r~;iXX~BkhguVp()FjY$t45p$T*(Kv+}>K+iy;=T>qtg5|LNC6k{q z_+Gg=$*b*^e`PG$uUm=(CT=X9x{0y_NvF@1&lj61Ab2#{oBUSyY*Gmxkg7L~vjQ!9 z)X+DcNTg4aLU6|5JO&mLx~!+hiqGw++|==Sv6r7JXET$5A?LKqMM_)8&C$spQUg;6 zahjxKTlPTHhsqy{OHZCWsSIPMOpiDfF}k)(h^!o1re!IXgj!WSWt#*q2Z{gm^Zy61 zz`gX&04ehgO6EN8Pd_`vx+*Pd$oOPtt$9M%r_n-baox8UHuQ67*zSFb()o7kJFeaH zvId@Dl8$Ll-&ADJKu$?LZ@SZ06T^A96oxw&k^nSyP^Y!GbevPZjyQ}77xZGyRS-Yo zYY?JWL zHvIyN0J!I_+gY{RXg>WbY_&P+O8Z>r%%g+ksc-J!y!A4{JHln&QFDFp%adU5?+Pe$Ql#t@;y7}na-y08al1R3t*j4Vg_+v$9PH&b}g8lxl>90Nk{{SKS zDb(*z+;<+tx?8GQXA~dJl0H1d&h8RJ8Q4(|3I=G;=CeA4YT_qE4&BPa)%R+}O=%_l zY#CfmcVp^Cx|Db{${ycy7HcaX6bHU&JNstQXL-AI0+Z7%*~NkO+C2b%X=E)Q@}56| zh_;(^{z@S7nVw(LFoiM^EA%lyJRIR$ObaYzh9LwB{VQk*W`ugK8|c&^HdzB3IX=#t z7qgpc{|`y$8OVnBzWvx+)vi^0ma1K=Hi<2X5u?dB}!G%R>$}8`#*2b`*WZBy3cieZUJ#%ZgB_PhJTLB<3E5aIXLW@G&AibaIuv^ zR)u~7O|@`DPEXz7&D=&o&=+2%p4`G$2AZ%};(vl#pHnl2)@)WX0SURNs;$?|U;hJq zS-?BbgoVNP4Ao}=y`{Wvcov43wC8szp+=OK%kd@Ami5RQ-4$c3X#o!DkYxSG%2}-> zW@YYkH~72H-S-j50H3-ug@k@sw<_diLu*%Hbn z6uI19aHo|=3W9|R_~?*91&OcJFYuQxBfjC-tTL-*YzmlEd&r1lhCbD1k|K!S*T7Fv zBbRT)lbHb>6QEx7n(h*tb($i&UWQYSwO&isDs@RwOd;d2BFnJUOwXQ;!34xNa@&rX z{g$fYDOS)#qWrFj3MTbOpzBQfQxC7U8x(Mo-dj}-2cEKb{IBdXM4`MIvyCK_kCT)$4$%8gMsfJzw{%vdX|W$b<;=e zJIv`0tX&X1@7jdUmkw}rWVHln`*W0afzlj>LNR(Z5YZBy)b_3w2lzb^<9FnyJ*i#I zIU(imK**sy{bT=0quoDEw{TWVSY@* zp8j>poXC;xH*4mW%aJVSUlK?3jrJdVRX7oegq*HuW9KM<7oq-;S6fm7{DE}_gsC6& z=v)6>dY2#B>Ns-UxtM`-c?lf348(fNC7<|+hg?Hqzv9^wkehLHmLTm%E;TO;M&2)+ z7zOGFuc?34dAh{c(>DuU(5TALvrbD)hVc3hefNoR^Oz5P%I7^4FAVe6gc7+qXJ4jX z&K7o~w&b9n8gzVz)-741;lKsV0WcBp9*ew>r2K=-lQsm!RkJr%+ zRy_&!Km(a5%l&(gU<@HopX8+}d}`N!_&kiDcTZ&)qMu+^kGGh1>k)nli3w3!Yr`BBd&~B= zCkJ$eiyu|5X*@o%A2k+jD?^+uM^K=1qR!LKam)5iBj1bygK0=@q<|W&@T%<3wM4gm zk?EP_s$fUlE|GA9XPEQfsSn(_$)CMzOW|#Tc_W{fs`t6zvJ47Y`A4F(-B&KKD-GGZd+-Wd_pV+q;5l zt@1F!XaxMjmojHe8b2p$ z$DoQ6O$!EnMk5%FE+|X_ZOl8%@vZk*q4BRr`rQ5wLUT0G(wD5*Oz4rOniApco#b|S z7i2iY?~uRpH`3wqMAJXjA&<9)&~Be^suBx@_%|3PZ`(ppduUzU6U1PUl zi(tgIP)wkb%bxG$SLWA)942~t+WW(T_JUU! 
z*n@5`nXnl~W1~=&3bt47YkqfyKOL!CFa=X?OUv(a43F?UJLH#oLkc#?2=Bf05V#2x z%~0g{r$|qm!r_Rsmw<;Jx1CjHpLE z1RJlmRQgrvDrFfwJm=5dQ!tme0*7&McakF!2(2F)^Dr{EP}V9R7=S4_t)?|5yFR@r zw5c2ylh?5NnQm-MrD#`3bXG@&F$LD&{M)#V zp9+WYw9f-(K-m+~X_+<~JR=QSazYM_O%`G@e$z?%^Li;!_uTv(ZQjfb>3n(myD5yj z%1M64tq!pd4s(WA6nFPvG)T=>fv)A@0%LpnMj(r!P9IE*s;Wj$YY5e~bWL<>ejN_B zJLQwiya;C*KF7X1aH#up-6QdRPzqxo;S+pdU>6|AVeK{(I;F5VQvWzSOBBLC;Mg6&6rC02ZmdUtSuV~3zHk+@GHdDpF1nROba(Wxj$**`w zFOo0VZ0dO z8hZpr=05&Bx*JGnQw{DAZG_YPwDH$On790@_atz(K&kMuI?L$YlO;ID} z^9IAQAHX2D-YyVu5 z`;Q46A~BZ!Z3Lpe*Po*<-m&O9ed${Ws157(=pa~NmdG`pZ| zHBg0-@20>Vz8&%{25L%XgR|(@cjd12H7BsYRbCml^sYybAG#ZvcYvqGQ(wk>B()`} z_+$0U)NR)Q${5n5;DwCwZ(fhSRqi8QwTbHA4@-U9B{Ju7{K~@P>WU-ji3-bFDkqgi z8a|Ai^!Qm?VZ2yCEc*=ciz;Z@br#zae>xD!G%tgpz>|xq=;Jp3-LKI+UiRbPq1RM) zM)j`E?r8xL@7X@I&q5`2s`9GC0dn!MoBq9|$X4frW>VP{Of1M?De46dE@~nK_7}BF zV+EMS(LcW<5&G}Z{OeG=_(y-w)4zh2#ND+7dpCcnH$ha!rNqnNh2w)qSdjuB97a%k zlI~q}*H9|0`=lnJx$;wixVbVEDL$mR)vrF7i?FmJ4)pV z6{#%{(l__&`q0Jg)?=Sk9Ay>{4j4RG{VY08UXLu-00=vmb=X$b;AuAx6rVQ%bzt}2 zKW8I+yiA921lt*EXV^>#+_!rl(-a+2@PXa^omIkL?+bPa_{&3vpk%qbU)za)99`$! zkbid)NJ(o+`=F23Q+?2C|3H?@*oW_XhmmHxZ=WQ#4^pyDryHdDyF+o}XvDHPQ~oc- z4-Tz0QV%8_MV)sCq6Bxm+f18335Y~U*Odslh2HNsMB(0F2k|}mc3Eq@nYzVGc6Ugh zGF{#<&Gv5Y(SEyYiX8dKN}?KJ2XXOLXWb&f~k?k?@_-vPqj+M)N3iW)Y~ z*2}P5&T*DK%36_ctclIgy7XdV8Rhl>=H>A-q?L*MKb3}=u-YG{zZx6uiB!m5=iA#G zYMO0`1o--A?S9PetoK*ZyCcoBq&Srs$$SFls*0+rI@ki!I(L@TJ6El<9NPoPYTrWe!_!Y?>*A4rXEphWK+m zZMs+PJE;DHIy^6#l=oy?Rjjb2pE;MZ@n$j(oT4hrBBt;7$Q3&AjX~K1b{<^bQXwO| zE!35MvbbxtsrTh-%JrI+svo{$X7=Hbh(R|3H6=!w&*DMmTQnu)tG+}@(Tw^ijPToX zIstqYw;Ck)um`7w9^Oz*hF)~7=za@2J5rd|sBt=VqSQ!)dZ z1vyn1DvSVjd!sOT)4GvK+^s!ZyP%7SdJ7aS6Hi#oVUb*EL*(vvPlPZo9!uiNRxvBB zC)8ZyY14Y6Vj=S46)*4y(1;~leNm~hzBvUKDTpY{3TUwBi7qJyG@qLZscCeE@($O~ zsWKrutn5~KlA8@&6oq(D07G-e$S3|3S$aCle();=(+8pM7HANF&x;qkvcV^qF`CNj zY*`v1dHUUvg-NmsBt_nvpcsQ{YhS52-k@W@8q`D)pfg&C`LeR)QY`+E)1u;=r?_bA zeFmpU9Z(eo4|hLhds^2@QF`*rdfb`%EVQt`Ko$AUvTV}G^{~?9-SCmc zdX+tHlEZQ|k}yy3&>2b0B+;4Xvtm;N!V+H$Fw-dv2fOTHlwZJkE~FK~1@@8<@A=1_ zf0r4gbQOW`I_@@#-&TM5*$8#|fUwRF@2CFIZW#8vU)Z1H1GigYDFXAyl9@{d{@*5|3=xUawFL@C&ly`&xqbu zF9M~v{3qiQpLcld3>T$D_~ORhG|4Kv%i@;HX)`0@Ja%Jli$O9>{sv%(I%Um&EnZw`?DHALz0)VPCa1NH=>Ug+dX~>XV?w zNEJLXAaWD;vEmYiRR~zZd|Nz8Y(!*wW%A}MHQ;~?WuWW{{sSzR z)z((`Kd_x8)ra4W7P(LlOT7Hsk&4kmPbTZ&G3mzg8fBZt97gElPsSG+(&+na;2#gu zSp3-k1K8aB=x|!u@wcA6Z{^Qr>$zDtkJsU)dosh$w6*&kCqyo9KCM{#4iqfr%#%xYW9L&+gk%Lg2aji4kFz z`CgBFpk?4z_dk*;_T$|%6!;}OHr^zR&dPGzOSYlo+4xSIOqIIxZrOrSs$R!n`N;~Y zGhF40XsgB}t_jI7mgiUXLaVIp9y}DQTH}qa?%*)G>F_toytN6pk89H^|M+M;P9*A< zR&S-QW{g<-T7f7)CgXjaFxSZL0(aP*n9oZ!+PWsqnXwOhD&4@(Bzv{>3979L7b?4~ zp|-vcnDkTM(~?VP26&Y|kVa_6Z|6gDg~W4wGQz<3zccaSHE;mSH)o>%)-BIJh^Yhr z0kFm({bV7p@66~60((d;$U8H_UpbveBfOt3U$`FJQ~8)9Q}Ohsvry))YTZzD-fEZF zgDS{rlNrOaFtTfICpg+dzrApB~>{ zCpIG`2*!M#oT=2#em#GXTfzn3QFYB#wsUN#vU#Hgu;N#$cX<3nSI0HeYA!IJJT~{B zjCVzny8O6ntH#j`j23M>?rOo=L~O?BPjVd3iRl#6eR*aZ7TUe_>WV|hbY}cREidJV zz&qRgo81wZ`gZD93PiFWV1#i6%T03tfb3jctZ9AaQ;cfW>cQ|O zd!iug$83kGLPNpj;9kqR9q25z@)PEMi1V|4DI+uLq+1>A4-8r$Dm5~Vy!RZf|4kY2 zA;E9tGdXd-v<#yQ7*@y>NGvcU6wXEnDL-jFxbAqo@l0*z8Bu{xOdnecIWW!ftQs>xa`fE9jG5f?&PV0o2$$h6S%!Qf z*4?pH;j!3^S0F!rK$V!14z};yt*yXS-MGxVvIN+_m|5S%Y}PPZ@fC@03M@&~Tc#3K z;!|Fi8K-svWH@f+6rrSu-DAVf+GE2oDX9UV))&i&QfKz9lQe{sQf~sI6EbHP> zQT3X~o^2|cW|hZCPct2<1VmO`BnJ$yV5FI|EsFW&n3Nn*=~~|B@|sn%3EfmIJAEDj zmgFwT?fwri)H=hn6V$u`PGa|Js-3AqkGw0r=?QeYUL-99pH6NJ^qrr(tCx^6plI#T zh#wGKHsB{+xJ98=KFK0+18Uqkk&w&wZ00lcJxntgFDTa#5fL*8oq%s~$p z+_utx*Hy5&N#tr7r_Wmg79Z%kZTiWDXxb5xK~~eKe(P{!&EVoOGz>2G-IrwKkmsL6 
z2Tn2%5AW%*J@p6A42Y?U8__Q+=9OP(#V`0?xAv206Qsxm&9sYbx%Oi>kH>_H2Z>I} zB3Yaah_uK?Jkqoe$D*RtR9R9BnNyAwGHyY%T_D(F;rjMA%S;PJl@CG5lQufc>mZ6d zQ)8w$$oJClAsBY6|K%kqpfXYDAS!khZ~m3Z-@n0!*@ijL5!Bgf#KM_wk5seU`^n=D=RRXTmEr7hvcfY^fa~&aO+#Z z77*IC@2pc=DLtg*X?0cFxv!~wxrzU#AKSTBNV#I1J88hg2j{UwZ3#!gU7Dm23?>Kz@F(_jA-pRv zL9pVy+M)A*fI0XJ9F3Iq^gdqlkR-0_d~%rg#K<4d1l=l<@mWDri2CwS>3)#9*wCQ|L!VlY!P zA%x)x{Ps^}@xTLw;l)qARHR@@+_}|8Ydv#(5JbMs8r}_U{qK4S@t~n6_GreEl!N^6izLwjrtvhBatav1R_=@W8ko?cHj9 zSS_{VR?Aji6BBD3!95<;VbXGyO#Gv#E9k03zha*@Q054s^6z!l zJeKL+-aCHT+-4en74*aJ%Nq`r7R0aL=H^vhw*o3GTq zT;)`LX<|udvA}!LF?WtZS)n-BsjT#lgM#nv;_j2eFXYMg1OY&KI@gD z?G(LO)y?F!+C`{2E16tiY5r-h?#!(~vl2U2bb5OUPbO}6nzLTo+3Xh2#bK6VQ95*Y z&Y)edLQZJghIzeb+&uy4e5|h?Ss0a{W8YgXZ(;{|zJI-WNj+6g%8yAkqtQ=z%&`}p zB#C!_D|rl28XK9PaT$WNbOT_^I^(Qdsw$F9$yxM7%Pw{beY9Ly8#W9r=`CzTQI%iY zLgiPL>kzmIfcRbSEv$TGC%N=EN{kPxhxf{MB*xvz>&-^ zk+D8j%-Lwlrf0@}rEXl&v5?o+w_X;#mMOD>Nz<*2~!H0>;$M0Y9f4y;jx!Fr% zYh)Yv7%P-A22Bi$b88aO)f{3L@rq7WGUky*jlZpZ@|MZ%YQm(0V{y~UO8aO{sXH8A z?g1VMaiII4*d{{7rt_t&aF@5St@%z#hjUsc`glw`r1PP^!^&a~kTKgV{=W6ipSq)#u4 zs_}g=XIRZ|QG&=pjVI1^eacooyqSWOH!cq=s#k`*T!8O6#pKA011ZWW_f%@32lcL1 zi?_aKrmX=f^9VNf&1A%WoTHKa|Y1hzU5KWYDNO;K>X0-RsxY!V)L@6g-0sStl($*%8jZuOJ<>2b7 zFPFzp5X2;6obGt|maPgH>El#yp(?*18gd}JbH%c2un1rmNMPCNN|Qe4s&{6Fc(xr6G(Q>%00#Xo^in$4%6$KWH(aDf> zIiQLC?c8x&ZRh?k##67M+2?&;M}1l+{i#W2@^|wdaHlm^xjCHb*+`~ckcw@m7ED9q z@c#jNKl(34=z{t4kN71HjjC*l&7QWk;j|}DcMLL zmOrOwQxN+%pTjvNZBV2&0fm`{sXy6nRwb!3^diG;I7fhjzp5swtgBb)lCGCe59F_< zOOOSTdyuHzS{b3kA*uCjh)({tMb3+9uwPPtN&sdCK@=Q4P|akqrJ@+tuWmgqADszF z1;}A|xy^+e)KHDBEsR5(=3gCx`0>cXOg~Ke9gmx@vbGx{anWF|WmsK=@>>v_*3PKqpFsUt5#(Cph z-Lw@Cj9DQL)5lDQd48Q*>rC#AD7N6YjeiW}md#poO|ZGG;|(6yB7gLV-)>HU{y-4Y zHj?>+n_bVjOJ463<_Ts*e3QhAYCMAcPI^j!V8nZ0eD+Rk<)*uXmv@9ps;mxK2dt_q zc%I+<$;GR-bnN_MX_@60{rC1$T4nS>xYn{jEM+Od@iZte^TC+pe*hJ*0$XV9$IK+w zO{;m-uJ_#ipz`G)77ms_3PGJNGIA>6q5-)aVXq(XtLh8EPG-ba}KgFYTklNw|zdudq5H7GCBWz zCf#7?Ylz-Ns!eGvGI5Zi6)s!E>cuJ8Y*%3ss~(8^n?o_j_WvaQ|8e`waCcNpAL8m1 z>zirk4(!Uc5UX9Mro~iYSmiYq_B@Yoej(ILvbUA9=9=U@d@)Rd9x#7jCLj;n7B3pB z9xM#h%m&hZ3>RdN$>h4`iw&(TiLTMG(LB#R-7(R3K?Trp7HJzmxX3ltO6pI(NjaSR zNL>oYG?+wqWHM}|M zsop8xAw#smM7q)is1EF|9agkUXB$qJ#In`XH8#@bj+?^oJmw{FUFgHsu{X}UiD#G$ zcV&<+*v=%`W82K_P1p`xU#nW`?=!V?Ge$psWDg=*7nD|KE+G=t?ZB%H@1Y} z&9H8N_{SAGF*Im$<%|*Kpz4TqH|Yir%hc<6#Ja6_Gx&@dzntnq)R=c*^~tH9W>yEN z+Cau%ozt<5tLcNJV)D_Bc##pWi3luJA1^$Z|jDCqCT{?s54}w z4B)5>$Lt23qfIaD7A|mo`B^#W*3T^!w?icTiQj#kU~yT02`eVhOe1FU2`L3Us;x8#x!;zC3B%a)R9#9T_le_%U>a zZZ_pJE&IfE-TTA>9jR;8lwh{+54*wRwdT_nDFU?)BRu^loeZ9{T9dm+m}g^lkaGSf zb@FK75n-aq>xk`!c71b5T#69ruw`bNqOc_ezc$qhU+OO38hhJ7b_A}qsKR+OD^c8j zCf>@@TioE&#(AwDV4@wRW$IMVG%O=cNM`?Xw@A=bYuj|FsEJmj)}x?D0N3^og3FYX zcBnBx?C9x#03veV&8u1`XyjnbKd*PaGX`oXKT zbqyFeTnlcNcm`hj`8x{UC#&a;^uFq{piciKUri*_Os)5rbcVK(daKsjJn)4!x3~f9 zf}EAE?u*tq9I6Xm-wXU0_S0pW)^>vhx;2fTNVvu{b(D~-}_ZJz5|()C)i_gB(6 zV6X8ZDS2nvKvnHaKImX|Ih(Tvqxp^SvqGj3Wv36?jC;%rnqR{;EniP3;i^aYEHi^^ zTjq1*T=DyY6Fjb>$bjS z@bV=F?RkAlyD$fNs$gF*!RqZ?7kw`Ia$D}4wa{qFf}gi*M<3Ec%V+M%F`Pumd;`6l zoo}&Yc#q^$(Gx`_pTh-3ueL2*!bp5V_1w>4V4>xZ>h4&bYE7qrlRB*gV92H&cmEx| z8X=N+>F*L@E4At|CL=BavOPsP@Xkq>{>)DGLbRDeN&j1mIvsmsrOCspNoD%)N>X<+itj#6EqCcNw_8*)t1M|M*0Il^ z^N`Fmtb4D-kJTDC5shsQ;ywb}!LiCHwiokR91h5HS6JxnhQ<^YU6EZ&9gm z$wZ5f&U2b7?!5u10%DUBQZISmbez&CP`4R$4o#<=g@tJGUfph=k5dH8jY@(N=a!mr zULOJH!BU5oee*mKzXt2d8m|7{+Uf3s*qM8m#y1fp;~rf5kfllR>pQMTY%b>>&xii2 zGF?($8-0&E{sRcv<*c%r{6z}xY+Tmf@ox=->EwR>h32M-G(XCH@*eMq)K5%LB(lsd zE~Fg{dk4b9-?`QN*7|G!@%qfS6_^ly&Ef9F0s{e3d+!X+-0@XlEXE#Gq;^NPY9|6I zBs4FR(jsDzuSjm?*H5PKDT{m4lMCUO24JuVC2i|!eBx+1yB%RVPe4a&^ym{oJ;kiu 
zl*6o*?mKkNVZmw5emkWhiTv25w1W9ZIjN5?ag|!jRHU zl+9_PqbKW%HtX(KeP>WFDM7Xq!nDIFGjXfF+~828eZ$CSeZVCzN3Xr8xml!$Ll#4< zs*0WZY1^NQiCZr`Xmk3Ru?#OL@J<#LP4q_|wAXFSN};Ck_apL-t70~tSJSRKfg%X* z9<~q5Gb?cKe(cbcSvHf@3h@c&nvd7N*7{^)N3Wy*FWQ6sSwdMh8{894JxjTe_LcBq zde4-Hv+#Aa@$rKl_^x)LA4B|gbO_Fcv}RRO`KX{ldK_1GOg!Qo`w2@29CH1Y!pZQ`mL78mPPd z?}8mdNHvw}iz)3XQ_cnTuJww>CG=83G<+y)mIs1+(ToUfdySe}DVg0pDbjsG!2K1E zznoj~-MeEjtdq`WxAIFbKMS8#>C0EyBq}17EHi%IwXrpQlqbYb!-P%?T4wPN4>oU#sp8mRH$_f?o9N-r= z_&Og-iV*M39B^&bdBr48ZvQ+NQ_C|`_ro7^2d!(s)8S0%YZNjGtYN$IBCrHMQ|2n@ zolbqJ-r3j2;zW0Gw?f4!44>j!z~#RvSpmWbmU+p?S2AW4;H{k`rnj0EqBBb^-vu^v zm=yqp$SpRs8RkO*6bl%ciA#X-hyb!Pj)mWcrJ5}03E;uD8q$!PBWzY#!~i;MHK8C1{_C}7AXJ(J6j zNgeFWSfj4ABclOk(q(&vX24ztDw^;Zp@|y&QZ^q(iZixKLARod6}3__Z0IZ%S%6y2 zCbCCWN|NGAb|^Z^1P+bDp(+MQ6`wbui;=iHKOuj$tk1LX@@!g7V0kn2)2W8E=#<;? zq&cS=Lh9CPT{Vx?^a=3-Q!?fKib|Th6WP2=D|hSeV!I7PG3#wxL&tp9qEL29Vu2o`@Owr(gMtKg=nR--WSBsx*{G zNL^Bz8I29SMUG$Rzrr4W*h6e^%?7(51OQeGroc9DZk~E#?1K*`CZ8vdP{l^YAZ2Z_ zH_|bT9-M(%h{fqWTPxSmN8Z=uV(nstVImQx^gEUXK@KB@eNE<*Pw>yylotYRYhTAf zPGtLaqwB-^I0>On1Xxm_TH!~P&gSN1l?nXv?_sagq7G<)CS-BTY^5IGh!0WKHOyGL zdHJ~J9oP9ZJgmaB`(s98zgtkvobcMWXgbo?$@20^=zy`O*hSWznt}qiUUggeY9<#~ zJnl(zax(`fuN|?ldp($ScxMij-e)5l* zX)0m&fLpHI7D!norwsVEm*hvdRGj&)q-F^B_TDP*s}~4ztPFKrmSPHRQaH@T7_=f* z^dP%#+{P-l4s@C$k_Nro-G3impHnJxyNDv?&Y24brwk@rVM&Dq@*Un>>GjIezhkmD zG3r=pLEIx7qeU?@7HT|c(JC+e>t=_PJJvd6q<8K5j@`1PWfP&eKNX$hlGMKG{=b0+;CB}PGluf4ieGcWmUMUBC1`u3Z`4A^67Hh$@u&lH| z6$gvN%0qT?tNX(trM^mhsfwZ7>Ypau^Xh}x>4qhbM7n`r#J}2XA#S;$8Z-3yW3a1@ zTo77M6x*Uped(Isol9onermiP>mS)rbqH00L(iaCkE+-O!AhT+ra+Nf z3z=#?ZjQD#1CK1ZTVa!On(}|n+Z3*3dvEjm8DetR^^@;A-y`6k>Sj9ek$J9Lv=k}H zpL6rD`XMA=6f>(5j$U3jDMiTI^u?Ds@8u!ssb_Q#{%XHTGN9Dhu{Q-G)mjgz71tQh zAiIB!7SCR#GX)k1n8&(iCL?)|I9|O+UjC+aB*E_krE9K(4@Bz0&JstYZp|-%ak2vf z(3;@CCZaiM53N_+mf7iv^7A36xp&4NgJzTKvZny=8TlCd0`WHhy z>)8?$SjDF-T0-ui$1SG;#X3_iT-Ln#IvVL5BDPOjk3IXkAzJVUJbs>B0NIB1RvOao zynhhRTdki9g;ViD&!*{OjQoj^c_>mpGWXW}CATag`FoDCqZf$CW*R{RQNt-d=>Aw& zDPXnfNzACf#JxMK_y&65^o(EJCM+0!d>pjn5hvF*1gi3DpufK9AElNfU&1`fUqGjs z;it1}ie>wQ_S@mnkaux?WncVYjiUy^on*0QY;{~2MD--KA?+j8>hBR>y|lQ~0D5%$ z*??}j#UxnxK95Hd0ivrMny9PKrlYiL*D?C&XO*b$-cJ>gNZd^)0427E8bZzPZ9R+E zE}`GFq zg=f)6H~1nr3Mrw4>G|i4T)2%RQnKKH5wT@GRsi(=irmg5(jgsBExM}u!KO;jhd#1~ zu(+`xs`d+4)}Gqz6dBgdwIDiwCTGtS=aL7+eX-nh8x@2%yyW}SQX-Yzd-`T((8~l~?rKx@*gZoBZ<1?p$kKNpXqq`?eN|>}-CQx&X zz0d6xKGb-WRuEb$#aKx60kxe?Ik^%ne?l+2%G=LI#HuA7k3Q?}v5)0;(H>_DKOO`- zG1$-hbht?3h%jbXhxVuGck8D1v^_{^ncnZ|qm(g;O#^GT^_ng_UfC;MRQ|lm3mnZ9 zm!H1vaSa3P5#A2fDO2b1O=X3dtj!>rhnOm?kf%pGazZ-aWmy zePTx`V+IA&e0R}K%C<^UmdUV=`G=!M0o{b0VxTOYY=xeQEEQ$LMmSGl@-&r)%Z6+> zpKB;H6dG6VsHkf?WvVA@kXJY_8Shy0g8Idm!jN!Z&*G zM{n0F9@4k|o?z0u`QuN|Z7uk*`on1SNv}41=Y=JwWz9&pZ0V5A@)$FE05m>eqM6Us zHLZ2|)jV9869 zbqa|}@!NZ$wP)>;=K*?c?fY9dw3!zoQ;c&HVWAi=23XWa)4Sz9iCt8nr=Nb%*ZV@G z!A5FY#RdGx+da(7bvUKR)V6PIltx+zRUUYErQ=xKH!1mT(JS@N6G^RFpn2o$6MI6~ zC&o4JH)$22Po8yGt)-5L8%5m#7O{L|b2;!@C=`$T%{(@eT(={=Q59=lpE6BWpMv<3 zG3B7jL}^2SGwme5?te;#5hI>#t2LWfy_-7jb<=(C!rZLZqK$8VYzL>B-Je=u{tb`4 ztMhYl!n?kYphZbDk%ojecwCAYreSnHggq9Xu^mUJ zd0$}0#=UJtR<*2!Rq{=sK_moL43p>87a!upX)h1Q;-5dsNexhiBgsU~!J9dR(3~5r zkCAk7HFwBpXW32Bkd)Sp?@s-t)%+8biE}czuQ0d|4;xHTa-x@W=gOE2kq8-LvRe6= zpXNK@dMS{>tfJ7RrSnY|+ZUeug7&teiUecfNc;nyFGpoXOKs#j-qT0g#zigqHMdzd z6aVZ6UIZyJbB~wA(qG%-*!O`)rmCIA-Upud#}rwk;Cj=v zuA1@N3+1>szIck#Ow96o6dv~Q3G?2)`XA_=OMv$KZeA=rcBVT9!c?nH0u`__96UDz zkwxvagn`d*4itC4{O;^-@5Dc@T?bUl0B_$SEpl6_a{l5a$3Uz^?LyxMkE!P|jLJiB zX_`OZQE3}xY1si70e9mO$6}v`U6-0d##|`=un!jR+eLl*X5wQki9Gz!7VXJSrx&n| zUHY*V{OWuD0eOcG(AxGJ<;Gnd(|na>^2Y=M@kzX(TYqU-rRHH~auzwOqDf0-;v4K; 
zDY$Pc^F6?ud4oQyaeGHAgfLeO6`9jSKnbr?A?sAooQc~VQr!=f9VQYx=LS~DYjL?G z+n~d`Vu8Z(t4621ZTDvKr>*XlW=_+zwzZaS?S+UEGCtFIftuthtG^u;J6wrsj6)e( zm7hp=U9;oG6~ z&)P5i;dA4>&TyNBMy8QHumgaD&%Px*j|dZ3p{I2TB>y&HQ|{B|O8wnLyuckqUoOhT zq0UcqhYo|z$J9q9wcz>zB@mz;U5`FWKi_wzQA=gOB2Z`s@*Q);?llmYHPDdk>)TvQ zyM7KTbx7Gx06E#oDNadjt{IYx=JaGGIRk+=*t0KPcd_fUVF${ihZXFVKJi?Tl@5FB z#j0njB)7ERile=2VgPn}>f76Dt1?#qY`3y&6-oBo|_ zV@zIj{Xy;6@ryNv*NKijDO@VoGC#Gc|N7?h;LDr~V#0bl5h-wkrD2E$lMiKEm3LiP zH#ZXWX=vfrXqDZ1{U3myk}j79`7-5HcV##d`q(X!4sw#Xj8J;`Y>0K(qg@H9KiO!L z3aD!df#ad1=yXThRXBB~Xk>`JWYNnb{_D%8fOTr~Yt4Lq1C*Q6 zl5D0N=u5rbvx5|-acDvO3_LEMZlXQ7?IeNb19wRnx?*!3S}3HP|EIRtD7fv0n-j$^ zi{7%()hZ6%U{7hEh$;E}xrSWE*M#b=kj3iwCYX54P*%4N$Ol*I>n3}L4w~c{uXIFw zt5hPLLj)S&BwNr(!R(LhobG}wXx3!`?c2w1JDODQrlP~KTpU*QETaOUMgCAwUcQA$ zG~a%HoqkrneAwGDy8(ycKtRC(PhQ_zQql6JjLu*q3?T{7}uSl znsBqFOVCTRh4+W)-0SvxKR=su{o_Dq>$D%hJ+2HJte^eb28$V?>Dwn4E=RkGHppQT z3V(R3r8OC~AK<>Z6{&Ug_1{bzd=ZM6(|Q%UyI%TTOrYlpk|(yQjttvZwHJsM6H_%* z#$3Af{Trd>euZA!cT{hm5&iRrE;I*Ty2sg@w&yvbpBV}-#a;cwW`$$AyBs~eyLkJk z2v?30DUV>gCV=R@-1u;^xK&%h#I=8Z%77i!+R9UZ$(HlsIci1Zo zzA|x2Y95^VqC)gt(a3yFgt>;E)5p@(GLO?e>1_6GO$VKgd%X69_hy}q{pZ9 zrB~=2AuTP7x2eKq6mV2+qz9*U!XAX!nu;oYU=O136uWdT88IdH1_+e4%`ng&`-J0Z z!s4S&Vr7kH2_D^q{{XBTzIngYDv1DKah<+N)O(bih#!|`$`@>y*E9=P{Nq<=K_T(Cf*Z3-agXH3HS4)>=9g~#h{n0*2hCdo2O4;fpvzfKU9Di_7Sro0yHq($C-umqTK_v4@(b9Oth-6@L~N2 z&eV$PrWS{i&UaZo)4vr|-A45H92n1R+Om|(nGM~SEzN7ZmRr~gnbg)bdP5&O@EkuE zy4PB~z%!g8P&hcpBBYItVf8?C%CkUJM~8Ug}F)>V&2hT z8C?FleiSgS+Zgx5E{N@;wTfEyUzvP^wrcuGR(nLye8^gLL4ty~rUOlt z3+;n=fly;^hyYT$T<#66ZZ5RKg7^Ey6j$cQK7Y;odtcsVe;R!n{HSz3x@VTuvv&b$ z_tJDwtgWh4DOhO}`7yubWtp2Y&`%a4c~o(Z(-3El!f&t_4SP~_7_rczFkv2^s^$ti zv+<|0f8H^TW&hfaiXsT9OC??jI6w- zu4RmLjtS*dk)(BQK_9n6TNn%(J3Of%b}(JTa2pj_Oy6U)A5~w#s>hItRf; zx^#z}p|a&DOGd%;`*`*J@E^|Uq^9?ZGi5Ek+NIZ2qHs{|5WZ}$t$TkNg4yBreyMTie<5w5x_;GicC%XPLX)MzhJD-uYxZbGyoBg^?bD>})9~_0#FNlj`FFAGfIpxdf8yU6YPb2|0Q|nAZESaQ*^e zvzNYDrWT-S#^~4%L3bd!q}IfK(BkG+)t!yz`rhA^->+{bdY+T%yPks|D6dc+{A@H2 z6Wvs$iGL^i9+hRybJ=X_K6Cn7e7@o2LKrHVxsIZTYO4Yb<2-bW8eUXK7wVoMejStWUTC9`sn)A)I- zQwj=GFflq8RVb|sgIb@!vgLYKv%*%hEh0dW2hcj~Z8WIM8uIDM4Jd4ksv9?CvXWp4 zU4EFAwSD;H9tOJvY<+Z^)O?M2A-dw=iMO^+fB8i^!_Zg;>f4x+jlZ2VBE@Jq)OzdR zEpDNh`NT})DoQ9#S$KH{aNw4Mv`2c!<4;2My*^&ItZHQBZlTS|u7jqhXR^ORv?fq& z9R7AwqIpcZX9#}*{nt)psb}+agK}_W?h38!lVFhRfP>gSuNO-*$M-|M8yl7ds#GI6 ziKKh8yEQE9!(ZGUy}~l7d_+5{RvSDE?uEH&Bp({n`O%)$GhRl>N67gGE6EwUaD3`h zO}4i;{{cy8$fcUy&TK+y^e+ccWn1!5E@(2HY!%xSC!HGIP23tW( z8`k9%C6SV0JE=R}_xPg}p;n(GG-@&i)~r6t0qrd3G|x&the062DDXUD$ZiB+2dz>6 zxqayRytr%|C6qNM@h*N9S*-F4q0xoKkj>nc)SJQR+DZFefW^2 z?3afpb#NWt{WG;Vzs2|*4bHy z!`*d8W{EP3>{W?4=PDxOoU-?dNTQ;JWJPwFWhE4e?9uN&-{0Tw^?E;F&*$^?dOlyz z=i}iXM>?T*wYro{cVXu1L~lHDdRi91;s0jDP`Rslb3F!UG#MLdc(BU$?Aer(URPzGE(mJH6by1!xWamX{q-vm09;*;Ykmw zaRc&7c21Kda9Q2sw6OH^gQRB_Rp@UV>Bi`)_y!1tD9Lz0`*6ZZIpLc&UO7j)qJ!~v zw<`cmx)0YdExbo0@MhUesq&Q@amW(7WX)wspmm#8S4J>EILS-fkZBev#c7ns&L$d# zi>BOU9U9~8g2;t)G}uVNCDkoq6X)Nj3w3)s#*MF>{*8p~RdHXN2_vQ2kGdEo2xw$x zn)>tkH4Lo6O9Kbvtv+mX!JK@&kjm;ic_&WKb%eBn2eImk` zz%=oGdQ#Gn;MJtymsm~xNc&{#$39m@eh4L=+JXfpEmbOckHIB53GiN>BzBIZ%#zlI zn(Ix2H_`>3ZzMV`Jl$Vz9HLjz!gA5{V%Cr0L&-Xwcj=*nh@LVkjm~JqVXYZsvN*LV zfqV93iYdH@Z`+(jIg&v)F@KeJQZe<@XSKguFRq8_+pLT-_asKozA4O4*XCkFy}Beq zyP6HtV`xe!4iswnbXlhy;8DyDSVNgPYC0T9Jckan2o=FhMy8~Nu3)Xa&t3jb8bK(R z`cc#0ci&$4K=3W9aPia%gExM%?X5j))3|QkTWW_lv!X8=k)?y6-Ml!Wthd@q52=x0 zY%;=Xr*uQ)hWs;tr#tHl^oclExUcQVhnDh7`U5WEIQyvNSL&`z?;@gHjjePn)mJ+Brxb5Uv9LH+a3 z;lZDxq*MO7sf!Kb&6^yd*;m5=CS_5CNr|7k>FY~kSGVgu69mXVvN-{NCY}rf{I`RMwPHscDM& zySy4S0RA*D0Y;&=#_kn5In;zK)yLleX 
z^I;c>Pn`u_-=2M78qw7h)5f^Gc*gE#ad5E|I)~RZIp_{A9&f~_dOGm7AWn;jE@cg! zgQWr#?WU3U zAkzkCE?nU#RrkBgadj3K&o09l;cKx zxN>fEJ7T|%x6F|D0*4n)-@27Z0+DP&=`%p7^8meDsUk965$7P!V`jt_QFyMdL0waK zhjIGaM~45kAC_VIIXKJNaz9b!S9r`%r3t+HnPUGR4c0%I1sD!qGx(JnC%4v$%aKDs z?q_Fr=zCYH(!W{sd3)*yDt1nC)U zU}~1li$6Cdz9Y<;oNbe`ptCR7+xbS`zZ^3>*Ud=hwuaG+3d(mFCGR~=dx|0ot)qZ^ zKFlu3Q}rHi8W~fd?QqF!-S)4Wv)o+oud?SICBIZnm!JJ|3w(@aQFs1Ar&nlS5S6zz zzBPb_oolzt<+ko#5$!a|;n#dHfTNv#H#uhB zoG7{IDm(bhby;4(j>~()`MB}md(Auto#LXaBa@~MR|X%kp5>oGJQq^1(~(WTT|Jz# z&c9GS;hpL9x%Ok%8Po@>FF}$TUNBro&MZbOVieyi3nbz$;0vJ6huN;mL7yHDaOB&xL96Z3oj*9 zSfS7gdU}Eaf~5Pv@!lT z2#VR>auyPp%S0&@K)|(l@)h=2grQzZ(m66`QAHff>M_^IqsBR5yQX-%}tmE z;Z|^HWqn(tbI}-5+EnlPX+Pz27jJMLuhM;I^|O0Pt(ULfKk(CFqhHXsb9vt_^y=56 zyJMSWB+SpoRHySP%q2_Cs1MC{rKH8cL`t$5ErE0*GKnQEIL|i|NlYsvi&jsreI*UY zSj1lFE-mt3X6`KIghH>Kedu=(D1v5`loq0yHC^QyBpVWN(Y0(D$z!XRrgk2-r;&8F z?;iimX)dDWJ-*VDCv~T^^iq0e)tVr!rgBx7`)eKmrHoG zS1+Zfdg9zt3%kWF8%~~uM~D~aKVSHLGwdYAjsLDC_p&IU@R04sc~UJmCx~`;uUb!arhs%!#A`WAtZTLL zepJR#kfe~aq}<)-9FMAoI1(jf$G#{}ygugXv$!Y#*qJVVtip-ZB$yP9nb+|@C^NQV zvP`U;R#q(m{4rVJ~;2)CR+egAxr zrcLphPU&07ZxuDbI+h5R#9Mqeytw-qyQ zxSGz=<3BM{{38SDXI<#C{=`;~8EEJX7D>5rhS}O|n8$2np>oTUU3I(fmx?@6?X+-` zP*J|aJ0Kox4Ko(#w#wiu3reB8kyr=x+u_B5EL?Kv`n3~xioW%nE<@8vLnoR;YCBueL<1OoC4{B zN?n6B1ufjW6}L$AS1)YW&+aPBCCK^P(E7tI9-(bWA)!o!#x!YmltvvMxAco*Db_|uj3bzg3={aFj5BW7hU+E z9$gjyO$B3nUvx;a4Z4I*&1Wv9ln5ySlC?mvp$-&E%%ttG+j6c|^~mPARL*DO<-&=| z8rVE5Q|+^(DLP|3+Qwva)7`R5j-l=2=3?w_BrGaJj2(2FyQo(es*X)p_4v)}`yjpa zI=hUg@Q40NGrOVCdvCYPn=-Y|{_VbNKFl)f%<;syy5T!*^tQY8(3x{0I=8vYEw<0p z*J=gl+c?)R^|V$EU46K8@;6KR3Z|x1RD0AV6E^<+WBtt)9}#lX`crksG9hLXrKsU% z&8v85$(EK=rYzyY8S}xV87EKkm*;yzJszD~cLpjs=d2A=%A+1VDUWQo_x!==3!%iH zF&$i5@u@-}va6c`zsHRVnA(9#Y5I!_pIeKFZrA8ww^ge;LQc0q=v+rHG&wX~@+7ep zGaxB9a#|m@SP3$<*-ueT4{(8SoTf21QR8q3_)fJ;zkl}G3NGm5T#IMY*V$oVr)x2! z_Tir%*%b|rt$qe}5AB}@X_D!mhaV!s>_%Yd)+?v2NH--AlD%q-vaV7sql_;67bj#U zftTY8bmUGW|7@Ypy<6=9_}416o?S9IzxmRA>$`|(>fBZ0kTlLfh6=-DDQbUYsTv8S zt6VT9n1pNz`WTeY3He=gxc{PeskX6<0+W4>emR`hg~4(5acDP5XBU3Ql0F*MPRRZ0 zt{YLeaw%pp>z269Au=ZW$54|*Mv8vWLPL4uv;pfptDZ7+HWYe4REXt5Y1OzY$L&G- zi8DpdZauLZc{uKAG)}j>)!Ab4y)KO?woVd~r5YObHO=^hj=Q8BN1vb%C_$P&FEQ(t z1V6hNOwxs6f9^Sz1{mK;HT^LqTv2>KHyezRP! zUv15ju=)jEX|HzlGX8lQZiQ1&Q&I216Z`A!Cy((vUyo>F!=AktnYdhbRnUmh+b3)O zV59uRe5pQIqzKcMT8bqid4m{T?VdoA4cgxthaL<3BRe$6GLnVzkIij>-7^nxDav^y zCmhZqaw=Du-p)$|l$3Mhc{B*@>}-tV8nnx{7Fa9%q-21Hf(QzXfW(or%tGmB2Z<;v zd~PE-w~-+2TE}E0_gFv{+LQ#mu0SncpOleO8!E$qh%gO9p?2FW`-gUIt+c9}tX(0m zr12}>3s4D0KK*Vwx6KebHB9FNwVM2%TIWAzU3xv9w@A(*t}d>equT50{u`VsP~Cg( zgsetEr_<1}FAyzf{@cuk6z}KRDZEc_oPJ9^v*%;sa_YC2OjDJlZ;QiWmP(;o-+G#7 zghR34l4{j1{4ed)(Q2vP%bT~5nQFgk)t$wy_S#rC7^N7oZLMa9zxAEI zt;2&6%G>ZCdt<)+TelLuQgYu7y65;w>yEuxw^1#(t2(%&)4IGAnsq9+mh`+=N`s0# zj&Ib=e^qXKvUzX_F&1`Gks(CZN_Z6Jc-3#1ipz8iH8w$32A{Z4&fRReT4?b@jgMvK zVo^A!H^sIWIL3GJFZ8xmxEZ)CoKmrT323eT(b?TM^7>9DF7bX;3D~18uy*rg-BRp) zO_%_kxcXF{Svn6~wNpgX3l$lkLIpF)#D2~_@ZxhjZB)+VS7+3 ze(KuIW3feH5=|)@T_D%_jUHjw*+wnp;{iqKkAnpFPAoN9UJ2D{+2+=FVJ0=%%Ae;% zb6bk9-ToMzZ&3E`P9A1~CtXA+@yYqE*s*GEKW>M*1SY12w-+cpjC4Vs|5~WR>r0(^ zgoL?%!44g8WKGGoI@>zW0>`Gf$LVA7SsZp(&-zUMK{poE)Q9FmkMHM-9~tf|M*agb zs~)kH+&JDCE>1bL%2LW!KDf8Xb|8;4I7~|Z2Otkqk7XAM4xSZ1fNU|*HS?p3{zmSp zjhKJX6hGqoRA?0~v8z-VO&N$`l8Tl71G)`7Ph3nLTE23IV0qMen0j>U=x*_k0i!~d zryZS=M+mEvgrXhseW{2|#ngh5C{YF3-?j4M%}a#jnI%Jk9i!+U6Kgm111koHpoStR z#N`K~R}6~0Lz4<04-NTY*Bz((6<;si6cazS?*2U7T&lda?(2!0SFcqSL>A4k&j_}r zgZ&vL`80?TafeJno;_E{o&rNjpx)q`X2F8<(B zZ#;ifMZLVfI_jZ@Op;=!{Dzg^OJ|-Bi&MO<^?3{* zy6WmY5LbD;*S2hs<5ekqHUhe2(I18sz4%eOLbRGYyZY+j-uwq@>$1A$Rd$gR=Tz1! 
zX=#)4RvVM#qN=ux>c;43Gj9G03a534J-J3OJd-p1#-OSo)7pG<3PB>mM}#Bpg`bR| zhDo<_eydxiF~AOIfU(YfS3%N$+IRe=Mtv(D+Ej~x00IwiVh^9y0ds(K18BHY5B3}6 z6jeU~{{WaSf33jkkDT8IE`DVbUO~HL7z7y;_b6F`?yJO2RN z{{Z42s*11TK4_3o?vpNyE$y#={*MVvD7E)7hy{_01 z5=nqydHO1j{71=N(Ek9w-z2i)FCNk|lovb?gqZ*zpVdLBku#t8dGO&~ss7zhRaaB^ z8TZ1=9^2lTPAA>fE@^0J?-TV{+L!y*TG7e5`K@h#;D3-<)qew*XN}F09i_~W>j4C1 zUFppwjt1lz0L|5EU+n(?@hf^vzsYFEwP*s`EptqPU;r|H{{Tc(T;1-eXeK6N0*;6O z0H&W$ivC4s^qIX&P%G*j%5c_An%mENtjEJX0^1(yT3_xTGMiJMMYcUNton}IF@iDz zuH!y)l{G~Re;2B>m%AkStGa%lRnXu~&B#^h{{Xsu7emv(@NM+`l-hDL^?EAJV1}Dm zk|eC+%FVvh@?A!&Aw+)-KlzaO- z2?bf{w5nC=6e;Z_mx~2!Q!=aepH6?eKJlsA+)VAWe3-HpEE;m~W>zQnA2OkYG5>oQ^+8^ig;J z0GUl!@elb{pB$e=9MUcc2LK2{8X8NQA+Zh^`@iIqKk(zsBY)xl0O9=#05r}tYq^2M zo597rmdy&}UI{WTarbqKKk*cNmcEzpzy5#O1M5kS_ofRhL4%k+YBw~pX`Vz9AVO3# z{{X~OSN{NYsw$l0$fc-zE(=ToctyC2WMAZ)RsR5utM|!BemZ}Aq_{HuvOp)ma#SXv z+*}$<&5N?5sgM3Y)ib}{XZ*zhWv+3tbcY+0Bj~1S+S!pk*pXiKKh{R*fNu~TJ z&-79CKlaa&{SiAXuA)wz?FRSzr@Oew%IPWV=~ki{J~_hk4JZCB{{Zu!&33v>555&= zjETo9T2*vZTz%m!fMiZ*m|0z3o|>f+$L)*)V#DqhgZ}{8{{Z1wn*RX6{{S)hDaW(Q zI6Y>7zo=B9LEGoz5Czs{S`{d~vZG5}Gb>|D{{W|d_`g*bUH<^azsY(MxvXRoU`drD zD>sX3nr2Ilq)qSHE#qKj$NYeRsBn*u)Jb6qp=yUeYPAN%e9 z0GKH9M!2oJOj!7HCQY0o3XoE4g5V2ynFsInQ|hwM`F~W*&&^@Bx(U+LKNE;>zyjzh zX^Vk#PoM|WD&0T!>3N_(&*nc=Z1ke~L9E-3V#Nb#1He28kYgXwTYg8GQ1bNlRv>*Q W?IJ;#7#T@3Gyed^`XsAq)Bo9M@!z!o diff --git a/lite/demo/cxx/mobile_detection/mobile_detection.cc b/lite/demo/cxx/ssd_detection/ssd_detection.cc similarity index 98% rename from lite/demo/cxx/mobile_detection/mobile_detection.cc rename to lite/demo/cxx/ssd_detection/ssd_detection.cc index 9b8f02aeed..011733eb87 100644 --- a/lite/demo/cxx/mobile_detection/mobile_detection.cc +++ b/lite/demo/cxx/ssd_detection/ssd_detection.cc @@ -194,7 +194,7 @@ void RunModel(std::string model_dir, std::string img_path) { } auto rec_out = detect_object(outptr, static_cast(cnt / 6), 0.6f, img); std::string result_name = - img_path.substr(0, img_path.find(".")) + "_detection_result.jpg"; + img_path.substr(0, img_path.find(".")) + "_ssd_detection_result.jpg"; cv::imwrite(result_name, img); } diff --git a/lite/demo/cxx/test_cv/README.md b/lite/demo/cxx/test_cv/README.md new file mode 100644 index 0000000000..36d2985a4f --- /dev/null +++ b/lite/demo/cxx/test_cv/README.md @@ -0,0 +1,131 @@ +# 图像预测库的使用 +1. 下载源码(https://github.com/PaddlePaddle/Paddle-Lite),打开LITE_WITH_CV=ON,编译full_publish模式 +example: +```shell +set BUILD_WITH_CV=ON or LITE_WITH_CV=ON +./lite/tools/build.sh +--arm_os=android +--arm_abi=armv8 +--arm_lang=gcc +--android_stl=c++_static +full_publish +``` + +2. 准备模型和优化模型 +example: +```shell +wget http://paddle-inference-dist.bj.bcebos.com/mobilenet_v1.tar.gz +tar zxvf mobilenet_v1.tar.gz +./lite/tools/build.sh build_optimize_tool +./build.model_optimize_tool/lite/api/model_optimize_tool +--optimize_out_type=naive_buffer +--optimize_out=model_dir +--model_dir=model_dir +--prefer_int8_kernel=false +``` + +3. 
编译并运行完整test_model_cv demo +example: +```shell +cd inference_lite_lib.android.armv8/demo/cxx/test_cv +``` + +- 修改MakeFile, 注释编译test_img_propress 语句 + ```shell + test_model_cv: fetch_opencv test_model_cv.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) + + test_model_cv.o: test_model_cv.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc + + #test_img_propress: fetch_opencv test_img_propress.o + # $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_propress.o -o test_img_propress $(CXX_LIBS) $(LDFLAGS) + + #test_img_propress.o: test_img_propress.cc + # $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_propress.o -c test_img_propress.cc + + .PHONY: clean + clean: + rm -f test_model_cv.o + rm -f test_model_cv + #rm -f test_img_propress.o + #rm -f test_img_propress + ``` +- 修改../../..//cxx/include/paddle_image_preprocess.h, 修改paddle_api.h头文件的路径 + ```shell + origin: + #include "lite/api/paddle_api.h" + #include "lite/api/paddle_place.h" + now: + #include "paddle_api.h" + #include "paddle_place.h" + ``` +- 测试模型必须是优化后的模型 + +```shell +make + +adb -s device_id push mobilenet_v1 /data/local/tmp/ +adb -s device_id push test_model_cv /data/local/tmp/ +adb -s device_id push test.jpg /data/local/tmp/ +adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ +adb -s device_id shell chmod +x /data/local/tmp/test_model_cv +adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/test_model_cv /data/local/tmp/mobilenet_v1 /data/local/tmp/test.jpg 1 3 224 224 " +``` +运行成功将在控制台输出部分预测结果 + +4. 编译并运行完整test_img_preprocess demo +example: +```shell +cd inference_lite_lib.android.armv8/demo/cxx/test_cv +``` + +- 修改MakeFile, 注释编译test_model_cv 语句 + ```shell + #test_model_cv: fetch_opencv test_model_cv.o + # $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_model_cv.o -o test_model_cv $(CXX_LIBS) $(LDFLAGS) + + #test_model_cv.o: test_model_cv.cc + # $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_model_cv.o -c test_model_cv.cc + + test_img_propress: fetch_opencv test_img_propress.o + $(CC) $(SYSROOT_LINK) $(CXXFLAGS_LINK) test_img_propress.o -o test_img_propress $(CXX_LIBS) $(LDFLAGS) + + test_img_propress.o: test_img_propress.cc + $(CC) $(SYSROOT_COMPLILE) $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -o test_img_propress.o -c test_img_propress.cc + + .PHONY: clean + clean: + #rm -f test_model_cv.o + #rm -f test_model_cv + rm -f test_img_propress.o + rm -f test_img_propress + ``` +- 修改../../..//cxx/include/paddle_image_preprocess.h, 修改paddle_api.h头文件的路径 + ```shell + origin: + #include "lite/api/paddle_api.h" + #include "lite/api/paddle_place.h" + now: + #include "paddle_api.h" + #include "paddle_place.h" + ``` +- 测试模型必须是优化后的模型 + +```shell +make + +adb -s device_id push mobilenet_v1 /data/local/tmp/ +adb -s device_id push test_img_propress /data/local/tmp/ +adb -s device_id push test.jpg /data/local/tmp/ +adb -s device_id push ../../../cxx/lib/libpaddle_full_api_shared.so /data/local/tmp/ +adb -s device_id shell chmod +x /data/local/tmp/test_model_cv +adb -s device_id shell "export LD_LIBRARY_PATH=/data/local/tmp/:$LD_LIBRARY_PATH && +/data/local/tmp/test_img_propress /data/local/tmp/test.jpg /data/local/tmp/ 3 3 1 3 224 224 /data/local/tmp/mobilenet_v1 " +adb -s device_id pull /data/local/tmp/resize.jpg ./ +adb -s device_id pull /data/local/tmp/convert.jpg ./ +adb -s device_id pull 
/data/local/tmp/flip.jpg ./ +adb -s device_id pull /data/local/tmp/rotate.jpg ./ +``` +运行成功将在控制台输出OpenCV 和 Padlle-lite的耗时;同时,将在test_cv目录下看到生成的图像预处理结果图: 如:resize.jpg、convert.jpg等 diff --git a/lite/demo/cxx/test_cv/test_img_prepross.cc b/lite/demo/cxx/test_cv/test_img_prepross.cc new file mode 100644 index 0000000000..c2cbd66cc0 --- /dev/null +++ b/lite/demo/cxx/test_cv/test_img_prepross.cc @@ -0,0 +1,389 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" // NOLINT +#include "paddle_image_preprocess.h" // NOLINT +#include "time.h" // NOLINT +typedef paddle::lite_api::Tensor Tensor; +typedef paddle::lite::utils::cv::ImageFormat ImageFormat; +typedef paddle::lite::utils::cv::FlipParam FlipParam; +typedef paddle::lite::utils::cv::TransParam TransParam; +typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess; +typedef paddle::lite_api::DataLayoutType LayoutType; +using namespace paddle::lite_api; // NOLINT + +void fill_with_mat(cv::Mat& mat, uint8_t* src) { // NOLINT + for (int i = 0; i < mat.rows; i++) { + for (int j = 0; j < mat.cols; j++) { + int tmp = (i * mat.cols + j) * 3; + cv::Vec3b& rgb = mat.at(i, j); + rgb[0] = src[tmp]; + rgb[1] = src[tmp + 1]; + rgb[2] = src[tmp + 2]; + } + } +} +void test_img(std::vector cluster_id, + std::vector thread_num, + std::string img_path, + std::string dst_path, + ImageFormat srcFormat, + ImageFormat dstFormat, + int width, + int height, + float rotate, + FlipParam flip, + LayoutType layout, + std::string model_dir, + int test_iter = 1) { + // init + // paddle::lite::DeviceInfo::Init(); + // read img and pre-process + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + float means[3] = {0.485f, 0.456f, 0.406f}; + float scales[3] = {0.229f, 0.224f, 0.225f}; + int srch = img.rows; + int srcw = img.cols; + for (auto& cls : cluster_id) { + for (auto& th : thread_num) { + std::cout << "cluster: " << cls << ", threads: " << th << std::endl; + // 1. Set MobileConfig + MobileConfig config; + config.set_model_dir(model_dir); + config.set_power_mode((PowerMode)cls); + config.set_threads(th); + std::cout << "model: " << model_dir; + + // 2. Create PaddlePredictor by MobileConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + // 3. 
Prepare input data from image + std::unique_ptr input_tensor(predictor->GetInput(0)); + + /* + imread(img_path, param) + IMREAD_UNCHANGED(<0) 表示加载原图,不做任何改变 + IMREAD_GRAYSCALE ( 0)表示把原图作为灰度图像加载进来 + IMREAD_COLOR (>0) 表示把原图作为RGB图像加载进来 + */ + cv::Mat img; + if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { + img = imread(img_path, cv::IMREAD_COLOR); + } else if (srcFormat == ImageFormat::GRAY) { + img = imread(img_path, cv::IMREAD_GRAYSCALE); + } else { + printf("this format %d does not support \n", srcFormat); + return; + } + if (img.empty()) { + std::cout << "opencv read image " << img_path.c_str() << " failed" + << std::endl; + return; + } + int srch = img.rows; + int srcw = img.cols; + int dsth = height; + int dstw = width; + + std::cout << " input tensor size, num= " << 1 << ", channel= " << 1 + << ", height= " << srch << ", width= " << srcw + << ", srcFormat= " << (ImageFormat)srcFormat << std::endl; + // RGBA = 0, BGRA, RGB, BGR, GRAY, NV21 = 11, NV12, + if (srcFormat == ImageFormat::GRAY) { + std::cout << "srcFormat: GRAY" << std::endl; + } + if (srcFormat == ImageFormat::BGR) { + std::cout << "srcFormat: BGR" << std::endl; + } + if (srcFormat == ImageFormat::RGB) { + std::cout << "srcFormat: RGB" << std::endl; + } + std::cout << " output tensor size, num=" << 1 << ", channel=" << 1 + << ", height=" << dsth << ", width=" << dstw + << ", dstFormat= " << (ImageFormat)dstFormat << std::endl; + + if (dstFormat == ImageFormat::GRAY) { + std::cout << "dstFormat: GRAY" << std::endl; + } + if (dstFormat == ImageFormat::BGR) { + std::cout << "dstFormat: BGR" << std::endl; + } + if (dstFormat == ImageFormat::RGB) { + std::cout << "dstFormat: RGB" << std::endl; + } + + std::cout << "Rotate = " << rotate << ", Flip = " << flip + << ", Layout = " << static_cast(layout) << std::endl; + if (static_cast(layout) != 1 && static_cast(layout) != 3) { + std::cout << "this layout" << static_cast(layout) + << " is no support" << std::endl; + } + int size = 3 * srch * srcw; + if (srcFormat == ImageFormat::BGR || srcFormat == ImageFormat::RGB) { + size = 3 * srch * srcw; + } else if (srcFormat == ImageFormat::GRAY) { + size = srch * srcw; + } + uint8_t* src = img.data; + + int out_size = srch * srcw; + int resize = dstw * dsth; + if (dstFormat == ImageFormat::BGR || dstFormat == ImageFormat::RGB) { + out_size = 3 * srch * srcw; + resize = 3 * dsth * dstw; + } else if (dstFormat == ImageFormat::GRAY) { + out_size = srch * srcw; + resize = dsth * dstw; + } + // out + uint8_t* lite_dst = new uint8_t[out_size]; + uint8_t* resize_tmp = new uint8_t[resize]; + uint8_t* tv_out_ratote = new uint8_t[out_size]; + uint8_t* tv_out_flip = new uint8_t[out_size]; + std::vector shape_out = {1, 3, srch, srcw}; + + input_tensor->Resize(shape_out); + Tensor dst_tensor = *input_tensor; + std::cout << "opencv compute" << std::endl; + cv::Mat im_convert; + cv::Mat im_resize; + cv::Mat im_rotate; + cv::Mat im_flip; + double to_1 = 0; + double to_2 = 0; + double to_3 = 0; + double to_4 = 0; + double to1 = 0; + for (int i = 0; i < test_iter; i++) { + clock_t start = clock(); + clock_t begin = clock(); + // convert bgr-gray + if (dstFormat == srcFormat) { + im_convert = img; + } else if (dstFormat == ImageFormat::BGR && + srcFormat == ImageFormat::GRAY) { + cv::cvtColor(img, im_convert, cv::COLOR_GRAY2BGR); + } else if (srcFormat == ImageFormat::BGR && + dstFormat == ImageFormat::GRAY) { + cv::cvtColor(img, im_convert, cv::COLOR_BGR2GRAY); + } else if (dstFormat == srcFormat) { + printf("convert format error \n"); + 
return; + } + clock_t end = clock(); + to_1 += (end - begin); + + begin = clock(); + // resize default linear + cv::resize(im_convert, im_resize, cv::Size(dstw, dsth), 0.f, 0.f); + end = clock(); + to_2 += (end - begin); + + begin = clock(); + // rotate 90 + if (rotate == 90) { + cv::flip(im_convert.t(), im_rotate, 1); + } else if (rotate == 180) { + cv::flip(im_convert, im_rotate, -1); + } else if (rotate == 270) { + cv::flip(im_convert.t(), im_rotate, 0); + } + end = clock(); + to_3 += (end - begin); + + begin = clock(); + // flip + cv::flip(im_convert, im_flip, flip); + end = clock(); + to_4 += (end - begin); + clock_t ovet = clock(); + to1 += (ovet - start); + } + + std::cout << "Paddle-lite compute" << std::endl; + double lite_to = 0; + double lite_to_1 = 0; + double lite_to_2 = 0; + double lite_to_3 = 0; + double lite_to_4 = 0; + double lite_to_5 = 0; + TransParam tparam; + tparam.ih = srch; + tparam.iw = srcw; + tparam.oh = dsth; + tparam.ow = dstw; + tparam.flip_param = flip; + tparam.rotate_param = rotate; + + ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); + + for (int i = 0; i < test_iter; ++i) { + clock_t start = clock(); + clock_t begin = clock(); + image_preprocess.imageConvert(src, lite_dst); + clock_t end = clock(); + lite_to_1 += (end - begin); + + begin = clock(); + image_preprocess.imageResize(lite_dst, resize_tmp); + end = clock(); + lite_to_2 += (end - begin); + + begin = clock(); + image_preprocess.imageRotate( + lite_dst, tv_out_ratote, (ImageFormat)dstFormat, srcw, srch, 90); + end = clock(); + lite_to_3 += (end - begin); + + begin = clock(); + image_preprocess.imageFlip( + lite_dst, tv_out_flip, (ImageFormat)dstFormat, srcw, srch, flip); + end = clock(); + lite_to_4 += (end - begin); + + clock_t over = clock(); + lite_to += (over - start); + + begin = clock(); + image_preprocess.image2Tensor(lite_dst, + &dst_tensor, + (ImageFormat)dstFormat, + srcw, + srch, + layout, + means, + scales); + end = clock(); + lite_to_5 += (end - begin); + } + to_1 = 1000 * to_1 / CLOCKS_PER_SEC; + to_2 = 1000 * to_2 / CLOCKS_PER_SEC; + to_3 = 1000 * to_3 / CLOCKS_PER_SEC; + to_4 = 1000 * to_4 / CLOCKS_PER_SEC; + to1 = 1000 * to1 / CLOCKS_PER_SEC; + std::cout << "opencv convert run time: " << to_1 + << "ms, avg: " << to_1 / test_iter << std::endl; + std::cout << "opencv resize run time: " << to_2 + << "ms, avg: " << to_2 / test_iter << std::endl; + std::cout << "opencv rotate run time: " << to_3 + << "ms, avg: " << to_3 / test_iter << std::endl; + std::cout << "opencv flip time: " << to_4 + << "ms, avg: " << to_4 / test_iter << std::endl; + std::cout << "opencv total run time: " << to1 + << "ms, avg: " << to1 / test_iter << std::endl; + std::cout << "------" << std::endl; + + lite_to_1 = 1000 * lite_to_1 / CLOCKS_PER_SEC; + lite_to_2 = 1000 * lite_to_2 / CLOCKS_PER_SEC; + lite_to_3 = 1000 * lite_to_3 / CLOCKS_PER_SEC; + lite_to_4 = 1000 * lite_to_4 / CLOCKS_PER_SEC; + lite_to_5 = 1000 * lite_to_5 / CLOCKS_PER_SEC; + lite_to = 1000 * lite_to / CLOCKS_PER_SEC; + std::cout << "lite convert run time: " << lite_to_1 + << "ms, avg: " << lite_to_1 / test_iter << std::endl; + std::cout << "lite resize run time: " << lite_to_2 + << "ms, avg: " << lite_to_2 / test_iter << std::endl; + std::cout << "lite rotate run time: " << lite_to_3 + << "ms, avg: " << lite_to_3 / test_iter << std::endl; + std::cout << "lite flip time: " << lite_to_4 + << "ms, avg: " << lite_to_4 / test_iter << std::endl; + std::cout << "lite total run time: " << lite_to + << "ms, avg: " << lite_to / test_iter 
<< std::endl; + std::cout << "lite img2tensor time: " << lite_to_5 + << "ms, avg: " << lite_to_5 / test_iter << std::endl; + std::cout << "------" << std::endl; + + double max_ratio = 0; + double max_diff = 0; + const double eps = 1e-6f; + // save_img + std::cout << "write image: " << std::endl; + std::string resize_name = dst_path + "/resize.jpg"; + std::string convert_name = dst_path + "/convert.jpg"; + std::string rotate_name = dst_path + "/rotate.jpg"; + std::string flip_name = dst_path + "/flip.jpg"; + cv::Mat resize_mat(dsth, dstw, CV_8UC3); + cv::Mat convert_mat(srch, srcw, CV_8UC3); + cv::Mat rotate_mat; + if (rotate == 90 || rotate == 270) { + rotate_mat = cv::Mat(srcw, srch, CV_8UC3); + } else { + rotate_mat = cv::Mat(srch, srcw, CV_8UC3); + } + cv::Mat flip_mat(srch, srcw, CV_8UC3); + fill_with_mat(resize_mat, resize_tmp); + fill_with_mat(convert_mat, lite_dst); + fill_with_mat(rotate_mat, tv_out_ratote); + fill_with_mat(flip_mat, tv_out_flip); + cv::imwrite(convert_name, convert_mat); + cv::imwrite(resize_name, resize_mat); + cv::imwrite(rotate_name, rotate_mat); + cv::imwrite(flip_name, flip_mat); + delete[] lite_dst; + delete[] resize_tmp; + delete[] tv_out_ratote; + delete[] tv_out_flip; + } + } +} + +int main(int argc, char** argv) { + if (argc < 7) { + std::cerr << "[ERROR] usage: " << argv[0] + << " image_path dst_apth srcFormat dstFormat width height\n"; + exit(1); + } + std::string image_path = argv[1]; + std::string dst_path = argv[2]; + int srcFormat = atoi(argv[3]); + int dstFormat = atoi(argv[4]); + int width = atoi(argv[5]); + int height = atoi(argv[6]); + int flip = -1; + float rotate = 90; + int layout = 1; + std::string model_dir = "mobilenet_v1"; + if (argc > 7) { + model_dir = argv[7]; + } + if (argc > 8) { + flip = atoi(argv[8]); + } + if (argc > 9) { + rotate = atoi(argv[9]); + } + if (argc > 10) { + layout = atoi(argv[10]); + } + test_img({3}, + {1, 2, 4}, + image_path, + dst_path, + (ImageFormat)srcFormat, + (ImageFormat)dstFormat, + width, + height, + rotate, + (FlipParam)flip, + (LayoutType)layout, + model_dir, + 20); + return 0; +} diff --git a/lite/demo/cxx/test_cv/test_model_cv.cc b/lite/demo/cxx/test_cv/test_model_cv.cc new file mode 100644 index 0000000000..24f408bf4a --- /dev/null +++ b/lite/demo/cxx/test_cv/test_model_cv.cc @@ -0,0 +1,224 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" // NOLINT +#include "paddle_image_preprocess.h" // NOLINT +#include "time.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} +// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up +void neon_mean_scale( + const float* din, float* dout, int size, float* mean, float* scale) { + float32x4_t vmean0 = vdupq_n_f32(mean[0]); + float32x4_t vmean1 = vdupq_n_f32(mean[1]); + float32x4_t vmean2 = vdupq_n_f32(mean[2]); + float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]); + float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]); + float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]); + + float* dout_c0 = dout; + float* dout_c1 = dout + size; + float* dout_c2 = dout + size * 2; + + int i = 0; + for (; i < size - 3; i += 4) { + float32x4x3_t vin3 = vld3q_f32(din); + float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0); + float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1); + float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2); + float32x4_t vs0 = vmulq_f32(vsub0, vscale0); + float32x4_t vs1 = vmulq_f32(vsub1, vscale1); + float32x4_t vs2 = vmulq_f32(vsub2, vscale2); + vst1q_f32(dout_c0, vs0); + vst1q_f32(dout_c1, vs1); + vst1q_f32(dout_c2, vs2); + + din += 12; + dout_c0 += 4; + dout_c1 += 4; + dout_c2 += 4; + } + for (; i < size; i++) { + *(dout_c0++) = (*(din++) - mean[0]) * scale[0]; + *(dout_c0++) = (*(din++) - mean[1]) * scale[1]; + *(dout_c0++) = (*(din++) - mean[2]) * scale[2]; + } +} +void pre_process(const cv::Mat& img, int width, int height, Tensor dstTensor) { +#ifdef LITE_WITH_CV + typedef paddle::lite::utils::cv::ImageFormat ImageFormat; + typedef paddle::lite::utils::cv::FlipParam FlipParam; + typedef paddle::lite::utils::cv::TransParam TransParam; + typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess; + typedef paddle::lite_api::DataLayoutType LayoutType; + // init TransParam + TransParam tp; + tp.iw = img.cols; + tp.ih = img.rows; + tp.ow = width; + tp.oh = height; + ImageFormat srcFormat = ImageFormat::BGR; + ImageFormat dstFormat = ImageFormat::RGB; + // init ImagePreprocess + ImagePreprocess img_process(srcFormat, dstFormat, tp); + // init temp var + const uint8_t* img_ptr = reinterpret_cast(img.data); + uint8_t* rgb_ptr = new uint8_t[img.cols * img.rows * 3]; + uint8_t* resize_ptr = new uint8_t[width * height * 3]; + // do convert bgr--rgb + img_process.imageConvert(img_ptr, rgb_ptr); + // do resize + img_process.imageResize(rgb_ptr, resize_ptr); + // data--tensor and normalize + float means[3] = {103.94f, 116.78f, 123.68f}; + float scales[3] = {0.017f, 0.017f, 0.017f}; + img_process.image2Tensor( + resize_ptr, &dstTensor, LayoutType::kNCHW, means, scales); + float* data = dstTensor.mutable_data(); +#else + cv::Mat rgb_img; + cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); + cv::resize(rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f); + cv::Mat imgf; + rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f); + float means[3] = {0.485f, 0.456f, 0.406f}; + float scales[3] = {0.229f, 0.224f, 0.225f}; + const float* dimg = reinterpret_cast(imgf.data); + float* data = dstTensor.mutable_data(); + neon_mean_scale(dimg, data, width * height, means, scales); +#endif +} + +void RunModel(std::string model_dir, + std::string img_path, + std::vector input_shape, + PowerMode power_mode, + int thread_num, + int 
test_iter, + int warmup = 0) { + // 1. Set MobileConfig + MobileConfig config; + config.set_model_dir(model_dir); + config.set_power_mode(power_mode); + config.set_threads(thread_num); + + // 2. Create PaddlePredictor by MobileConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + // 3. Prepare input data from image + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + input_tensor->Resize( + {input_shape[0], input_shape[1], input_shape[2], input_shape[3]}); + auto* data = input_tensor->mutable_data(); + // read img and pre-process + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + + pre_process(img, input_shape[3], input_shape[2], *input_tensor); + + // 4. Run predictor + for (int i = 0; i < warmup; ++i) { + predictor->Run(); + } + double lps = 0.f; + double min_time = 1000000.f; + double max_time = 0.f; + for (int i = 0; i < test_iter; ++i) { + clock_t begin = clock(); + predictor->Run(); + clock_t end = clock(); + double t = (end - begin) * 1000; + t = t / CLOCKS_PER_SEC; + lps += t; + if (t < min_time) { + min_time = t; + } + if (t > max_time) { + max_time = t; + } + std::cout << "iter: " << i << ", time: " << t << " ms" << std::endl; + } + std::cout << "================== Speed Report ===================" + << std::endl; + std::cout << "Model: " << model_dir + << ", power_mode: " << static_cast(power_mode) + << ", threads num " << thread_num << ", warmup: " << warmup + << ", repeats: " << test_iter << ", avg time: " << lps / test_iter + << " ms" + << ", min time: " << min_time << " ms" + << ", max time: " << max_time << " ms." << std::endl; + + // 5. Get output and post process + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + auto* outptr = output_tensor->data(); + auto shape_out = output_tensor->shape(); + int output_num = 1; + for (int i = 0; i < shape_out.size(); ++i) { + output_num *= shape_out[i]; + } + std::cout << "output_num: " << output_num << std::endl; + for (int i = 0; i < output_num; i += 100) { + std::cout << "i: " << i << ", out: " << outptr[i] << std::endl; + } +} + +int main(int argc, char** argv) { + if (argc < 7) { + std::cerr << "[ERROR] usage: " << argv[0] + << " model_dir image_path input_shape\n"; + exit(1); + } + std::string model_dir = argv[1]; + std::string img_path = argv[2]; + std::vector input_shape; + input_shape.push_back(atoi(argv[3])); + input_shape.push_back(atoi(argv[4])); + input_shape.push_back(atoi(argv[5])); + input_shape.push_back(atoi(argv[6])); + int power_mode = 3; + int threads = 1; + int test_iter = 100; + int warmup = 10; + if (argc > 7) { + power_mode = atoi(argv[7]); + } + if (argc > 8) { + threads = atoi(argv[8]); + } + if (argc > 9) { + test_iter = atoi(argv[9]); + } + if (argc > 10) { + warmup = atoi(argv[10]); + } + RunModel(model_dir, + img_path, + input_shape, + (PowerMode)power_mode, + threads, + test_iter, + warmup); + return 0; +} diff --git a/lite/demo/cxx/yolov3_detection/yolov3_detection.cc b/lite/demo/cxx/yolov3_detection/yolov3_detection.cc new file mode 100644 index 0000000000..a9beb1ed28 --- /dev/null +++ b/lite/demo/cxx/yolov3_detection/yolov3_detection.cc @@ -0,0 +1,238 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "opencv2/core.hpp" +#include "opencv2/imgcodecs.hpp" +#include "opencv2/imgproc.hpp" +#include "paddle_api.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +struct Object { + cv::Rect rec; + int class_id; + float prob; +}; + +int64_t ShapeProduction(const shape_t& shape) { + int64_t res = 1; + for (auto i : shape) res *= i; + return res; +} + +const char* class_names[] = {"person", "bicycle", "car", + "motorcycle", "airplane", "bus", + "train", "truck", "boat", + "traffic light", "fire hydrant", "stop sign", + "parking meter", "bench", "bird", + "cat", "dog", "horse", + "sheep", "cow", "elephant", + "bear", "zebra", "giraffe", + "backpack", "umbrella", "handbag", + "tie", "suitcase", "frisbee", + "skis", "snowboard", "sports ball", + "kite", "baseball bat", "baseball glove", + "skateboard", "surfboard", "tennis racket", + "bottle", "wine glass", "cup", + "fork", "knife", "spoon", + "bowl", "banana", "apple", + "sandwich", "orange", "broccoli", + "carrot", "hot dog", "pizza", + "donut", "cake", "chair", + "couch", "potted plant", "bed", + "dining table", "toilet", "tv", + "laptop", "mouse", "remote", + "keyboard", "cell phone", "microwave", + "oven", "toaster", "sink", + "refrigerator", "book", "clock", + "vase", "scissors", "teddy bear", + "hair drier", "toothbrush"}; + +// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up +void neon_mean_scale(const float* din, + float* dout, + int size, + const std::vector mean, + const std::vector scale) { + if (mean.size() != 3 || scale.size() != 3) { + std::cerr << "[ERROR] mean or scale size must equal to 3\n"; + exit(1); + } + float32x4_t vmean0 = vdupq_n_f32(mean[0]); + float32x4_t vmean1 = vdupq_n_f32(mean[1]); + float32x4_t vmean2 = vdupq_n_f32(mean[2]); + float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]); + float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]); + float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]); + + float* dout_c0 = dout; + float* dout_c1 = dout + size; + float* dout_c2 = dout + size * 2; + + int i = 0; + for (; i < size - 3; i += 4) { + float32x4x3_t vin3 = vld3q_f32(din); + float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0); + float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1); + float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2); + float32x4_t vs0 = vmulq_f32(vsub0, vscale0); + float32x4_t vs1 = vmulq_f32(vsub1, vscale1); + float32x4_t vs2 = vmulq_f32(vsub2, vscale2); + vst1q_f32(dout_c0, vs0); + vst1q_f32(dout_c1, vs1); + vst1q_f32(dout_c2, vs2); + + din += 12; + dout_c0 += 4; + dout_c1 += 4; + dout_c2 += 4; + } + for (; i < size; i++) { + *(dout_c0++) = (*(din++) - mean[0]) * scale[0]; + *(dout_c0++) = (*(din++) - mean[1]) * scale[1]; + *(dout_c0++) = (*(din++) - mean[2]) * scale[2]; + } +} + +void pre_process(const cv::Mat& img, int width, int height, float* data) { + cv::Mat rgb_img; + cv::cvtColor(img, rgb_img, cv::COLOR_BGR2RGB); + cv::resize( + rgb_img, rgb_img, cv::Size(width, height), 0.f, 0.f, cv::INTER_CUBIC); + cv::Mat imgf; + rgb_img.convertTo(imgf, CV_32FC3, 1 / 255.f); + std::vector mean = {0.485f, 0.456f, 0.406f}; + 
std::vector scale = {0.229f, 0.224f, 0.225f}; + const float* dimg = reinterpret_cast(imgf.data); + neon_mean_scale(dimg, data, width * height, mean, scale); +} + +std::vector detect_object(const float* data, + int count, + float thresh, + cv::Mat& image) { // NOLINT + if (data == nullptr) { + std::cerr << "[ERROR] data can not be nullptr\n"; + exit(1); + } + std::vector rect_out; + for (int iw = 0; iw < count; iw++) { + int oriw = image.cols; + int orih = image.rows; + if (data[1] > thresh) { + Object obj; + int x = static_cast(data[2]); + int y = static_cast(data[3]); + int w = static_cast(data[4] - data[2] + 1); + int h = static_cast(data[5] - data[3] + 1); + cv::Rect rec_clip = + cv::Rect(x, y, w, h) & cv::Rect(0, 0, image.cols, image.rows); + obj.class_id = static_cast(data[0]); + obj.prob = data[1]; + obj.rec = rec_clip; + if (w > 0 && h > 0 && obj.prob <= 1) { + rect_out.push_back(obj); + cv::rectangle(image, rec_clip, cv::Scalar(0, 0, 255), 1, cv::LINE_AA); + std::string str_prob = std::to_string(obj.prob); + std::string text = std::string(class_names[obj.class_id]) + ": " + + str_prob.substr(0, str_prob.find(".") + 4); + int font_face = cv::FONT_HERSHEY_COMPLEX_SMALL; + double font_scale = 1.f; + int thickness = 1; + cv::Size text_size = + cv::getTextSize(text, font_face, font_scale, thickness, nullptr); + float new_font_scale = w * 0.5 * font_scale / text_size.width; + text_size = cv::getTextSize( + text, font_face, new_font_scale, thickness, nullptr); + cv::Point origin; + origin.x = x + 3; + origin.y = y + text_size.height + 3; + cv::putText(image, + text, + origin, + font_face, + new_font_scale, + cv::Scalar(0, 255, 255), + thickness, + cv::LINE_AA); + + std::cout << "detection, image size: " << image.cols << ", " + << image.rows + << ", detect object: " << class_names[obj.class_id] + << ", score: " << obj.prob << ", location: x=" << x + << ", y=" << y << ", width=" << w << ", height=" << h + << std::endl; + } + } + data += 6; + } + return rect_out; +} + +void RunModel(std::string model_dir, std::string img_path) { + // 1. Set MobileConfig + MobileConfig config; + config.set_model_dir(model_dir); + + // 2. Create PaddlePredictor by MobileConfig + std::shared_ptr predictor = + CreatePaddlePredictor(config); + + const int in_width = 608; + const int in_height = 608; + + // 3. Prepare input data from image + // input 0 + std::unique_ptr input_tensor0(std::move(predictor->GetInput(0))); + input_tensor0->Resize({1, 3, in_height, in_width}); + auto* data0 = input_tensor0->mutable_data(); + cv::Mat img = imread(img_path, cv::IMREAD_COLOR); + pre_process(img, in_width, in_height, data0); + // input1 + std::unique_ptr input_tensor1(std::move(predictor->GetInput(1))); + input_tensor1->Resize({1, 2}); + auto* data1 = input_tensor1->mutable_data(); + data1[0] = img.rows; + data1[1] = img.cols; + + // 4. Run predictor + predictor->Run(); + + // 5. 
Get output and post process + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + auto* outptr = output_tensor->data(); + auto shape_out = output_tensor->shape(); + int64_t cnt = 1; + for (auto& i : shape_out) { + cnt *= i; + } + auto rec_out = detect_object(outptr, static_cast(cnt / 6), 0.5f, img); + std::string result_name = + img_path.substr(0, img_path.find(".")) + "_yolov3_detection_result.jpg"; + cv::imwrite(result_name, img); +} + +int main(int argc, char** argv) { + if (argc < 3) { + std::cerr << "[ERROR] usage: " << argv[0] << " model_dir image_path\n"; + exit(1); + } + std::string model_dir = argv[1]; + std::string img_path = argv[2]; + RunModel(model_dir, img_path); + return 0; +} diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index ce8b8365a8..74b86c519e 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -1,6 +1,6 @@ # NOTE we leave the add_kernel not protected by LITE_WITH_LIGHT_WEIGHT_FRAMEWORK so that all the kernels will be registered # to the model_optimize_tool. -if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)) +if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))) return() endif() diff --git a/lite/kernels/arm/collect_fpn_proposals_compute.cc b/lite/kernels/arm/collect_fpn_proposals_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/collect_fpn_proposals_compute.h b/lite/kernels/arm/collect_fpn_proposals_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/conditional_block_compute.cc b/lite/kernels/arm/conditional_block_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/conditional_block_compute.h b/lite/kernels/arm/conditional_block_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/conv_compute.cc b/lite/kernels/arm/conv_compute.cc index 8c76f243a6..52849a026e 100644 --- a/lite/kernels/arm/conv_compute.cc +++ b/lite/kernels/arm/conv_compute.cc @@ -110,8 +110,7 @@ void ConvCompute::PrepareForRun() { bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh); bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2)); - bool flag_dw_5x5 = pads_all_equal && - ((kw == 5 && sw == 1) || (kw == 5 && sw == 2 && pw == 2)); + bool flag_dw_5x5 = pads_all_equal && (kw == 5 && sw == 1); bool flag_dw = flag_dw_3x3 || flag_dw_5x5; if (param.groups == ic && ic == oc && kps_equal && pads_equal && @@ -156,8 +155,7 @@ void ConvCompute::PrepareForRun() { bool kps_equal = (pw == ph) && (sh == sw) && (kw == kh); bool no_dilation = (dilations[0] == 1) && (dilations[1] == 1); bool flag_dw_3x3 = (kw == 3 && kh == 3 && (sw == 1 || sw == 2)); - bool flag_dw_5x5 = pads_all_equal && - ((kw == 5 && sw == 1) || (kw == 5 && sw == 2 && pw == 2)); + bool flag_dw_5x5 = pads_all_equal && (kw == 5 && sw == 1); bool flag_dw = flag_dw_3x3 || flag_dw_5x5; if (param.groups == ic && ic == oc && kps_equal && pads_equal && diff --git a/lite/kernels/arm/conv_transpose_compute_test.cc b/lite/kernels/arm/conv_transpose_compute_test.cc deleted file mode 100644 index 298c651d9f..0000000000 --- a/lite/kernels/arm/conv_transpose_compute_test.cc +++ /dev/null @@ -1,371 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/conv_transpose_compute.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -template -static void basic_gemm(int m, - int n, - int k, - const type* a, - const type* b, - const type2* bias, - type2* c, - type2 alpha, - type2 beta, - bool trans_a = false, - bool trans_b = false, - bool flag_bias = false, - bool flag_relu = false) { -#pragma omp parallel for - for (int i = 0; i < m; ++i) { - type2 bias_data = (type2)0; - if (flag_bias) { - bias_data = bias[i]; - } - for (int j = 0; j < n; ++j) { - type2 sum = static_cast(0); - for (int l = 0; l < k; ++l) { - type av; - type bv; - if (trans_a) { - av = a[l * m + i]; - } else { - av = a[i * k + l]; - } - if (trans_b) { - bv = b[j * k + l]; - } else { - bv = b[l * n + j]; - } - sum += av * bv; - } - type2 tmp = alpha * sum + beta * c[i * n + j] + bias_data; - if (flag_relu) { - c[i * n + j] = tmp > (type2)0 ? tmp : (type2)0; - } else { - c[i * n + j] = tmp; - } - } - } -} - -//! for float, dtype1 and type2 is float -//! for int8, dytpe1 is char, dtype2 is int -template -bool deconv_basic(const Dtype1* din, - Dtype2* dout, - int num, - int chout, - int hout, - int wout, - int chin, - int hin, - int win, - const Dtype1* weights, - const Dtype2* bias, - int group, - int kernel_w, - int kernel_h, - int stride_w, - int stride_h, - int dila_w, - int dila_h, - int pad_w, - int pad_h, - bool flag_bias, - bool flag_relu) { - int m = chout * kernel_w * kernel_h / group; - int n = hin * win; - int k = chin / group; - - if (chin != chout || group != chin) { - CHECK_OR_FALSE(chin % group == 0); - CHECK_OR_FALSE(chout % group == 0); - } - - lite::Tensor workspace_tensor; - std::vector wt_shape = {1, 1, 1, group * m * n}; - workspace_tensor.Resize(wt_shape); - auto* workspace_ptr = workspace_tensor.mutable_data(); - - int group_size_in = win * hin * chin / group; - int group_size_out = wout * hout * chout / group; - int group_size_coldata = m * n; - int group_size_weights = chin * chout * kernel_w * kernel_h / (group * group); - bool flag_1x1s1p1 = (kernel_w == 1) && (kernel_h == 1) && (stride_h == 1) && - (stride_w == 1) && (pad_w == 1) && (pad_h == 1) && - (dila_w == 1) && (dila_h == 1); - - for (int i = 0; i < num; ++i) { - const Dtype1* din_batch = din + i * chin * hin * win; - Dtype2* dout_batch = dout + i * chout * hout * wout; - - Dtype2* col_data = workspace_ptr; - if (flag_1x1s1p1) { - col_data = dout_batch; - } - memset(col_data, 0, sizeof(Dtype2) * group_size_coldata); - for (int g = 0; g < group; ++g) { - const Dtype1* din_group = din_batch + g * group_size_in; - const Dtype1* weights_group = weights + g * group_size_weights; - Dtype2* coldata_group = col_data + g * group_size_coldata; - basic_gemm(m, - n, - k, - weights_group, - din_group, - nullptr, - coldata_group, - (Dtype2)1, - (Dtype2)0, - true, - false, - false, - (!flag_bias && flag_relu)); - } - if (!flag_1x1s1p1) { - lite::arm::math::col2im(col_data, - 
chout, - hout, - wout, - kernel_h, - kernel_w, - pad_h, - pad_w, - stride_h, - stride_w, - dila_h, - dila_w, - dout_batch); - } - if (flag_bias) { - lite::arm::math::fill_bias_relu( - dout_batch, bias, chout, wout * hout, flag_bias, flag_relu); - } - } - return true; -} - -template -void conv2d_transpose_compute_ref(const operators::ConvParam& param) { - const Dtype1* din = param.x->data(); - Dtype2* dout = param.output->mutable_data(); - - int num = param.x->dims()[0]; - int chout = param.output->dims()[1]; - int hout = param.output->dims()[2]; - int wout = param.output->dims()[3]; - - int chin = param.x->dims()[1]; - int hin = param.x->dims()[2]; - int win = param.x->dims()[3]; - - const Dtype1* weights = param.filter->mutable_data(); - Dtype2* bias = nullptr; - if (param.bias != nullptr) { - bias = param.bias->mutable_data(); - } - - int group = param.groups; - int kernel_h = param.filter->dims()[2]; - int kernel_w = param.filter->dims()[3]; - int stride_h = param.strides[0]; - int stride_w = param.strides[1]; - int dila_h = param.dilations[0]; - int dila_w = param.dilations[1]; - - int pad_h = param.paddings[0]; - int pad_w = param.paddings[1]; - bool flag_bias = (param.bias != nullptr); - bool flag_relu = param.fuse_relu; - - deconv_basic(din, - dout, - num, - chout, - hout, - wout, - chin, - hin, - win, - weights, - bias, - group, - kernel_w, - kernel_h, - stride_w, - stride_h, - dila_w, - dila_h, - pad_w, - pad_h, - flag_bias, - flag_relu); -} - -TEST(conv2d_transpose_arm, retrive_op) { - auto op = KernelRegistry::Global().Create( - "conv2d_transpose"); - ASSERT_FALSE(op.empty()); - ASSERT_TRUE(op.front()); -} - -TEST(conv2d_transpose_arm, init) { - Conv2DTransposeCompute compute; - ASSERT_EQ(compute.precision(), PRECISION(kFloat)); - ASSERT_EQ(compute.target(), TARGET(kARM)); -} - -TEST(conv2d_transpose_arm, compute) { - DeviceInfo::Init(); - for (auto n : {1, 2}) { - for (auto ic : {1, 3 /*, 128*/}) { - for (auto oc : {1, 3 /*, 128*/}) { - for (auto ih : {2, 8 /*, 56 , 112, 224, 512*/}) { - for (auto iw : {2, 8 /*, 56, 112, 224, 512*/}) { - for (auto flag_bias : {false, true}) { - for (auto flag_relu : {false, true}) { - for (auto dilation : {1, 2}) { - for (auto stride : {1, 2}) { - for (auto padding : {0, 1, 2}) { - for (auto ks : {2, 3, 5}) { - for (auto group : {1, 2}) { - // obtain shape - if (ic % group != 0 || oc % group != 0) { - group = 1; - } - std::vector input_shape = {n, ic, ih, iw}; - std::vector filter_shape = { - oc / group, ic, ks, ks}; - int oh = (ih - 1) * stride - 2 * padding + - dilation * (ks - 1) + 1; - int ow = (iw - 1) * stride - 2 * padding + - dilation * (ks - 1) + 1; - if (oh < 1 || ow < 1) { - break; - } - std::vector output_shape = {n, oc, oh, ow}; - std::vector bias_shape = {1, oc, 1, 1}; - - // define and resize tensor - Tensor input; - Tensor filter; - Tensor filter_copy; - Tensor bias; - Tensor output; - Tensor output_ref; - input.Resize(input_shape); - filter.Resize(filter_shape); - filter_copy.Resize(filter_shape); - output.Resize(output_shape); - output_ref.Resize(output_shape); - auto* input_data = input.mutable_data(); - auto* filter_data = filter.mutable_data(); - auto* filter_copy_data = - filter_copy.mutable_data(); - auto* output_data = output.mutable_data(); - - // initialize tensor - for (int i = 0; i < input.dims().production(); i++) { - float sign = i % 3 == 0 ? 
-1.0f : 1.0f; - input_data[i] = sign * static_cast(i % 128); - } - for (int i = 0; i < filter.dims().production(); i++) { - filter_data[i] = - i / - static_cast(filter.dims().production()); - filter_copy_data[i] = - i / static_cast( - filter_copy.dims().production()); - } - if (flag_bias) { - bias.Resize(bias_shape); - auto* bias_data = bias.mutable_data(); - for (int i = 0; i < bias.dims().production(); i++) { - bias_data[i] = static_cast(i); - } - } - - // prepare kernel params and run - std::unique_ptr ctx(new KernelContext); - ctx->As(); - Conv2DTransposeCompute conv2d_transpose; - conv2d_transpose.SetContext(std::move(ctx)); - operators::ConvParam param; - param.x = &input; - param.filter = &filter; - param.output = &output; - param.bias = nullptr; - if (flag_bias) { - bias.Resize(bias_shape); - auto* bias_data = bias.mutable_data(); - for (int i = 0; i < bias.dims().production(); i++) { - bias_data[i] = static_cast(i); - } - param.bias = &bias; - } - param.fuse_relu = flag_relu; - param.paddings = std::vector({padding, padding}); - param.strides = std::vector({stride, stride}); - param.dilations = - std::vector({dilation, dilation}); - param.groups = group; - conv2d_transpose.SetParam(param); - conv2d_transpose.Launch(); - - // invoking ref implementation and compare results - param.filter = &filter_copy; - param.output = &output_ref; - conv2d_transpose_compute_ref(param); - auto* output_ref_data = - output_ref.mutable_data(); - for (int i = 0; i < output.dims().production(); i++) { - EXPECT_NEAR( - output_data[i], output_ref_data[i], 1e-3); - } - } - } - } - } - } - } - } - } - } - } - } - } -} - -} // namespace arm -} // namespace kernels -} // namespace lite -} // namespace paddle -USE_LITE_KERNEL(conv2d_transpose, kARM, kFloat, kNCHW, def); diff --git a/lite/kernels/arm/distribute_fpn_proposals_compute.cc b/lite/kernels/arm/distribute_fpn_proposals_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/distribute_fpn_proposals_compute.h b/lite/kernels/arm/distribute_fpn_proposals_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/grid_sampler_compute.cc b/lite/kernels/arm/grid_sampler_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/grid_sampler_compute.h b/lite/kernels/arm/grid_sampler_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/instance_norm_compute.cc b/lite/kernels/arm/instance_norm_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/instance_norm_compute.h b/lite/kernels/arm/instance_norm_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/merge_lod_tensor_compute.cc b/lite/kernels/arm/merge_lod_tensor_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/merge_lod_tensor_compute.h b/lite/kernels/arm/merge_lod_tensor_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/merge_lod_tensor_compute_test.cc b/lite/kernels/arm/merge_lod_tensor_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/reduce_prod_compute.cc b/lite/kernels/arm/reduce_prod_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/reduce_prod_compute.h b/lite/kernels/arm/reduce_prod_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/split_lod_tensor_compute.cc b/lite/kernels/arm/split_lod_tensor_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/split_lod_tensor_compute.h b/lite/kernels/arm/split_lod_tensor_compute.h old mode 100755 new mode 100644 diff --git 
a/lite/kernels/arm/split_lod_tensor_compute_test.cc b/lite/kernels/arm/split_lod_tensor_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/arm/yolo_box_compute.cc b/lite/kernels/arm/yolo_box_compute.cc index ad8a630b8c..38443bf277 100644 --- a/lite/kernels/arm/yolo_box_compute.cc +++ b/lite/kernels/arm/yolo_box_compute.cc @@ -32,6 +32,8 @@ void YoloBoxCompute::Run() { int class_num = param.class_num; float conf_thresh = param.conf_thresh; int downsample_ratio = param.downsample_ratio; + Boxes->clear(); + Scores->clear(); lite::arm::math::yolobox(X, ImgSize, Boxes, diff --git a/lite/kernels/cuda/CMakeLists.txt b/lite/kernels/cuda/CMakeLists.txt index bf59d02726..2df00f00a4 100644 --- a/lite/kernels/cuda/CMakeLists.txt +++ b/lite/kernels/cuda/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT LITE_WITH_CUDA) +if((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_CUDA)) return() endif() diff --git a/lite/kernels/cuda/conv_compute_test.cc b/lite/kernels/cuda/conv_compute_test.cc index 2ebd7e33ba..46b63f2e31 100644 --- a/lite/kernels/cuda/conv_compute_test.cc +++ b/lite/kernels/cuda/conv_compute_test.cc @@ -15,6 +15,7 @@ #include "lite/kernels/cuda/conv_compute.h" #include #include +#include #include #include diff --git a/lite/kernels/cuda/elementwise_add_compute.cu b/lite/kernels/cuda/elementwise_add_compute.cu deleted file mode 100644 index 4bacf532a2..0000000000 --- a/lite/kernels/cuda/elementwise_add_compute.cu +++ /dev/null @@ -1,139 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include "lite/backends/cuda/math/elementwise.h" -#include "lite/core/op_registry.h" -#include "lite/kernels/cuda/elementwise_add_compute.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace cuda { - -void ElementwiseAddCompute::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto stream = ctx.exec_stream(); - - const lite::Tensor* x = param.X; - const lite::Tensor* y = param.Y; - lite::Tensor* out = param.Out; - - CHECK(x->dims().production() == y->dims().production()); - - auto* x_data = x->data(); - auto* y_data = y->data(); - auto out_data = out->mutable_data(TARGET(kCUDA)); - - int pixel_num = x->numel(); - lite::cuda::math::elementwise_add( - pixel_num, x_data, y_data, out_data, stream); - - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); -} - -void ElementwiseAddComputeNHWC::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto stream = ctx.exec_stream(); - - const lite::Tensor* x = param.X; - const lite::Tensor* y = param.Y; - lite::Tensor* out = param.Out; - - CHECK(x->dims().production() == y->dims().production()); - - auto* x_data = x->data(); - auto* y_data = y->data(); - auto out_data = out->mutable_data(TARGET(kCUDA)); - - int pixel_num = x->numel(); - lite::cuda::math::elementwise_add( - pixel_num, x_data, y_data, out_data, stream); - - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); -} - -void ElementwiseAddComputeInt8::Run() { - auto& param = this->Param(); - auto& ctx = this->ctx_->template As(); - auto stream = ctx.exec_stream(); - - const lite::Tensor* x = param.X; - const lite::Tensor* y = param.Y; - lite::Tensor* out = param.Out; - - CHECK(x->dims().production() == y->dims().production()); - - const int c = x->dims()[3]; - - auto* x_data = x->data(); - auto* y_data = y->data(); - auto out_data = out->mutable_data(TARGET(kCUDA)); - - int pixel_num = x->numel(); - float output_scale = param.output_scale; - if (c % 4 == 0) { - lite::cuda::math::elementwise_add_nhwc4_int8( - pixel_num / 4, - static_cast(x_data), - static_cast(y_data), - 1. / output_scale, - static_cast(out_data), - stream); - } else { - lite::cuda::math::elementwise_add_int8( - pixel_num, x_data, y_data, 1. 
/ output_scale, out_data, stream); - } - - cudaError_t error = cudaGetLastError(); - if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error); -} - -} // namespace cuda -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(elementwise_add, - kCUDA, - kFloat, - kNCHW, - paddle::lite::kernels::cuda::ElementwiseAddCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))}) - .BindInput("Y", {LiteType::GetTensorTy(TARGET(kCUDA))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kCUDA))}) - .Finalize(); - -REGISTER_LITE_KERNEL(elementwise_add, - kCUDA, - kFloat, - kNHWC, - paddle::lite::kernels::cuda::ElementwiseAddComputeNHWC, - nhwc_format) - .BindInput("X", - {LiteType::GetTensorTy(TARGET(kCUDA), - PRECISION(kFloat), - DATALAYOUT(kNHWC))}) - .BindInput("Y", - {LiteType::GetTensorTy(TARGET(kCUDA), - PRECISION(kFloat), - DATALAYOUT(kNHWC))}) - .BindOutput("Out", - {LiteType::GetTensorTy(TARGET(kCUDA), - PRECISION(kFloat), - DATALAYOUT(kNHWC))}) - .Finalize(); diff --git a/lite/kernels/cuda/elementwise_add_compute.h b/lite/kernels/cuda/elementwise_add_compute.h deleted file mode 100644 index 5c3fecc5d8..0000000000 --- a/lite/kernels/cuda/elementwise_add_compute.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "lite/core/kernel.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace cuda { - -class ElementwiseAddCompute - : public KernelLite { - public: - using param_t = operators::ElementwiseParam; - - void Run() override; - virtual ~ElementwiseAddCompute() = default; -}; - -class ElementwiseAddComputeNHWC - : public KernelLite { - public: - using param_t = operators::ElementwiseParam; - - void Run() override; - virtual ~ElementwiseAddComputeNHWC() = default; -}; - -class ElementwiseAddComputeInt8 - : public KernelLite { - public: - using param_t = operators::ElementwiseParam; - - void Run() override; - virtual ~ElementwiseAddComputeInt8() = default; -}; - -} // namespace cuda -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/cuda/elementwise_add_compute_test.cc b/lite/kernels/cuda/elementwise_add_compute_test.cc deleted file mode 100644 index cc63f1470b..0000000000 --- a/lite/kernels/cuda/elementwise_add_compute_test.cc +++ /dev/null @@ -1,166 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/cuda/elementwise_add_compute.h" -#include -#include -#include -#include "lite/api/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace cuda { - -using Tensor = lite::Tensor; - -static void ElementwiseAddRef(float* x, float* y, float* out, int num) { - for (int i = 0; i < num; ++i) { - out[i] = x[i] + y[i]; - } -} - -TEST(elementwise_add, normal) { - ElementwiseAddCompute elementwise_add_kernel; - std::unique_ptr ctx(new KernelContext); - auto& context = ctx->As(); - - operators::ElementwiseParam param; - Tensor x, y, out; - Tensor x_cpu, y_cpu, out_cpu; - Tensor x_ref, y_ref, out_ref; - - const int n = 1; - const int c = 3; - const int h = 2000; - const int w = 2000; - - x.Resize({n, c, h, w}); - y.Resize({n, c, h, w}); - out.Resize({n, c, h, w}); - x_cpu.Resize({n, c, h, w}); - y_cpu.Resize({n, c, h, w}); - out_cpu.Resize({n, c, h, w}); - x_ref.Resize({n, c, h, w}); - y_ref.Resize({n, c, h, w}); - out_ref.Resize({n, c, h, w}); - - auto* out_data = out.mutable_data(TARGET(kCUDA)); - - auto* x_cpu_data = x_cpu.mutable_data(); - auto* y_cpu_data = y_cpu.mutable_data(); - auto* out_cpu_data = out_cpu.mutable_data(); - - auto* x_ref_data = x_ref.mutable_data(); - auto* y_ref_data = y_ref.mutable_data(); - auto* out_ref_data = out_ref.mutable_data(); - - for (int i = 0; i < x_cpu.numel(); ++i) { - x_cpu_data[i] = i + 5.0; - x_ref_data[i] = i + 5.0; - } - for (int i = 0; i < y_cpu.numel(); ++i) { - y_cpu_data[i] = i - 5.0; - y_ref_data[i] = i - 5.0; - } - - x.Assign(x_cpu_data, x_cpu.dims()); - y.Assign(y_cpu_data, y_cpu.dims()); - - param.X = &x; - param.Y = &y; - param.Out = &out; - elementwise_add_kernel.SetParam(param); - - cudaStream_t stream; - cudaStreamCreate(&stream); - context.SetExecStream(stream); - - elementwise_add_kernel.SetContext(std::move(ctx)); - elementwise_add_kernel.Launch(); - cudaDeviceSynchronize(); - - CopySync( - out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH); - ElementwiseAddRef(x_ref_data, y_ref_data, out_ref_data, out.numel()); - for (int i = 0; i < out.numel(); i++) { - EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5); - } -} - -TEST(elementwise_add, int8_out) { - ElementwiseAddComputeInt8 elementwise_add_kernel; - std::unique_ptr ctx(new KernelContext); - auto& context = ctx->As(); - - operators::ElementwiseParam param; - Tensor x, y, out; - Tensor x_cpu, y_cpu, out_cpu; - - const int n = 1; - const int h = 36; - const int w = 36; - const int c = 125; - - x.Resize({n, h, w, c}); - y.Resize({n, h, w, c}); - out.Resize({n, h, w, c}); - x_cpu.Resize({n, h, w, c}); - y_cpu.Resize({n, h, w, c}); - out_cpu.Resize({n, h, w, c}); - - auto* out_data = out.mutable_data(TARGET(kCUDA)); - - auto* x_cpu_data = x_cpu.mutable_data(); - auto* y_cpu_data = y_cpu.mutable_data(); - auto* out_cpu_data = out_cpu.mutable_data(); - - for (int i = 0; i < x_cpu.numel(); ++i) { - x_cpu_data[i] = i + 5.0; - } - for (int i = 0; i < y_cpu.numel(); ++i) { - y_cpu_data[i] = i; - } - - x.Assign(x_cpu_data, x_cpu.dims()); - y.Assign(y_cpu_data, y_cpu.dims()); - - param.X = &x; - param.Y = &y; - param.Out = &out; - param.output_scale = 50 / 127.; - elementwise_add_kernel.SetParam(param); - - cudaStream_t stream; - cudaStreamCreate(&stream); - context.SetExecStream(stream); - - elementwise_add_kernel.SetContext(std::move(ctx)); - auto start = GetCurrentUS(); - for (int i = 0; i < 1000000; i++) { - 
elementwise_add_kernel.Launch(); - } - LOG(INFO) << "time: " << (GetCurrentUS() - start) / 1000000.; - - CopySync( - out_cpu_data, out_data, sizeof(int8_t) * out.numel(), IoDirection::DtoH); - for (int i = 0; i < out.numel(); i++) { - // LOG(INFO) << float(out_cpu_data[i]); - } -} - -} // namespace cuda -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/cuda/mul_compute.h b/lite/kernels/cuda/mul_compute.h index c2fc4364ef..320b562128 100644 --- a/lite/kernels/cuda/mul_compute.h +++ b/lite/kernels/cuda/mul_compute.h @@ -93,7 +93,6 @@ class MulCompute : public KernelLite { .Slice(param.y_num_col_dims, param.y->dims().size()) .production()); CHECK_EQ(x_w, y_h) << "x_w must be equal with y_h"; - LOG(INFO) << x_h << " " << x_w << " " << y_h << " " << y_w; mul_compute(blas, x_data, x_h, x_w, y_data, y_h, y_w, out_data); } diff --git a/lite/kernels/cuda/sequence_pool_concat_compute.cu b/lite/kernels/cuda/sequence_pool_concat_compute.cu old mode 100755 new mode 100644 diff --git a/lite/kernels/cuda/sequence_pool_concat_compute.h b/lite/kernels/cuda/sequence_pool_concat_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/cuda/yolo_box_compute.cu b/lite/kernels/cuda/yolo_box_compute.cu index 0a00c06cbf..6b4b2875f3 100644 --- a/lite/kernels/cuda/yolo_box_compute.cu +++ b/lite/kernels/cuda/yolo_box_compute.cu @@ -233,7 +233,7 @@ REGISTER_LITE_KERNEL(yolo_box, DATALAYOUT(kNCHW))}) .BindInput("ImgSize", {LiteType::GetTensorTy(TARGET(kCUDA), - PRECISION(kFloat), + PRECISION(kInt32), DATALAYOUT(kNCHW))}) .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kCUDA), diff --git a/lite/kernels/fpga/CMakeLists.txt b/lite/kernels/fpga/CMakeLists.txt old mode 100644 new mode 100755 index 7c47e72872..f6c3a39949 --- a/lite/kernels/fpga/CMakeLists.txt +++ b/lite/kernels/fpga/CMakeLists.txt @@ -1,4 +1,4 @@ -if (NOT LITE_WITH_FPGA) +if ((NOT LITE_ON_MODEL_OPTIMIZE_TOOL) AND (NOT LITE_WITH_FPGA)) return() endif() diff --git a/lite/kernels/fpga/calib_compute.cc b/lite/kernels/fpga/calib_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/conv_compute.cc b/lite/kernels/fpga/conv_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/conv_compute.h b/lite/kernels/fpga/conv_compute.h old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/dropout_compute.cc b/lite/kernels/fpga/dropout_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/elementwise_compute.cc b/lite/kernels/fpga/elementwise_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/fc_compute.h b/lite/kernels/fpga/fc_compute.h old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/feed_compute.cc b/lite/kernels/fpga/feed_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/feed_compute.h b/lite/kernels/fpga/feed_compute.h old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/fetch_compute.h b/lite/kernels/fpga/fetch_compute.h old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/gru_compute.h b/lite/kernels/fpga/gru_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/im2sequence_compute.cc b/lite/kernels/fpga/im2sequence_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/im2sequence_compute.h b/lite/kernels/fpga/im2sequence_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/mul_compute.h b/lite/kernels/fpga/mul_compute.h old mode 100755 new mode 100644 diff --git 
a/lite/kernels/fpga/multiclass_nms_compute.cc b/lite/kernels/fpga/multiclass_nms_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/norm_compute.cc b/lite/kernels/fpga/norm_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/norm_compute.h b/lite/kernels/fpga/norm_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/pooling_compute_test.cc b/lite/kernels/fpga/pooling_compute_test.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/prior_box_compute.cc b/lite/kernels/fpga/prior_box_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/prior_box_compute.h b/lite/kernels/fpga/prior_box_compute.h old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/reshape_compute.cc b/lite/kernels/fpga/reshape_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/fpga/scale_compute.cc b/lite/kernels/fpga/scale_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/scale_compute.h b/lite/kernels/fpga/scale_compute.h old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/softmax_compute.cc b/lite/kernels/fpga/softmax_compute.cc old mode 100644 new mode 100755 diff --git a/lite/kernels/fpga/transpose_compute.cc b/lite/kernels/fpga/transpose_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt index c84e996f4c..2c516e47e4 100644 --- a/lite/kernels/npu/bridges/CMakeLists.txt +++ b/lite/kernels/npu/bridges/CMakeLists.txt @@ -1,11 +1,10 @@ -if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU AND NOT LITE_WITH_BM) +if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XPU) return() endif() lite_cc_library(subgraph_bridge_registry SRCS registry.cc DEPS op) - lite_cc_library(subgraph_bridge_engine SRCS engine.cc DEPS tensor op scope program) diff --git a/lite/kernels/npu/bridges/act_op.cc b/lite/kernels/npu/bridges/act_op.cc index 62eb649e0e..a4d1009f1b 100644 --- a/lite/kernels/npu/bridges/act_op.cc +++ b/lite/kernels/npu/bridges/act_op.cc @@ -43,33 +43,34 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(out_type->layout() == DATALAYOUT(kNCHW)); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Act node - auto act_node = graph->AddNode(out_name); - act_node->set_input_x(*x_node); + auto act_node = graph->Add(out_name); + auto act_op = act_node->data(); + act_op->set_input_x(*x_node->data()); // TODO(hong19860320) set the coef value for act Ops, such as leaky_relu, // clipped_relu etc. 
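Note on the pattern repeated through these NPU bridge hunks: converters no longer hold typed ge operator pointers; graph->Add()/Get() hand back a generic Node wrapper, and the concrete HiAI IR operator is fetched through a typed data<T>() accessor. Below is a minimal, self-contained sketch of that idiom; Operator and Activation here are simplified stand-ins for illustration, not the real HiAI classes.

#include <memory>
#include <utility>

struct Operator {                 // stand-in for the HiAI base operator type
  virtual ~Operator() = default;
};

struct Activation : Operator {    // stand-in for an activation IR op
  void set_attr_mode(int mode) { mode_ = mode; }
  int mode_{0};
};

class Node {
 public:
  explicit Node(std::shared_ptr<Operator> data) : data_(std::move(data)) {}

  // Typed view of the wrapped operator; callers name the concrete op type
  // exactly once, at the point where they set inputs or attributes.
  template <typename T>
  std::shared_ptr<T> data() {
    return std::static_pointer_cast<T>(data_);
  }

 private:
  std::shared_ptr<Operator> data_;
};

int main() {
  Node act_node(std::make_shared<Activation>());
  act_node.data<Activation>()->set_attr_mode(1);  // same shape as act_op.cc
  return 0;
}

The gain is uniformity: a converter can pass Node handles around for variable, const, and data nodes alike, and only commits to a concrete operator type at the use site.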
- act_node->set_attr_mode(CvtActMode(op_type)); + act_op->set_attr_mode(CvtActMode(op_type)); if (op_type == "relu_clipped") { auto Relu_clipped_coef = op_info->GetAttr("Relu_clipped_coef"); - act_node->set_attr_coef(Relu_clipped_coef); + act_op->set_attr_coef(Relu_clipped_coef); } else if (op_type == "relu6") { float Relu_clipped_coef = 6.f; - act_node->set_attr_coef(Relu_clipped_coef); + act_op->set_attr_coef(Relu_clipped_coef); } else if (op_type == "leaky_relu") { auto alpha = op_info->GetAttr("alpha"); - act_node->set_attr_negative_slope(alpha); + act_op->set_attr_negative_slope(alpha); } else if (op_type == "hard_sigmoid") { auto slope = op_info->GetAttr("slope"); auto offset = op_info->GetAttr("offset"); - act_node->set_attr_negative_slope(slope); - act_node->set_attr_coef(offset); + act_op->set_attr_negative_slope(slope); + act_op->set_attr_coef(offset); } return SUCCESS; } @@ -79,25 +80,27 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - sigmoid, +REGISTER_SUBGRAPH_BRIDGE(sigmoid, + kNPU, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, relu, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, tanh, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - relu_clipped, +REGISTER_SUBGRAPH_BRIDGE(relu, kNPU, paddle::lite::subgraph::npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(tanh, kNPU, paddle::lite::subgraph::npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(relu_clipped, + kNPU, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, relu6, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - leaky_relu, +REGISTER_SUBGRAPH_BRIDGE(relu6, + kNPU, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, abs, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - softsign, +REGISTER_SUBGRAPH_BRIDGE(leaky_relu, + kNPU, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - softplus, +REGISTER_SUBGRAPH_BRIDGE(abs, kNPU, paddle::lite::subgraph::npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(softsign, + kNPU, paddle::lite::subgraph::npu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - hard_sigmoid, +REGISTER_SUBGRAPH_BRIDGE(softplus, + kNPU, + paddle::lite::subgraph::npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(hard_sigmoid, + kNPU, paddle::lite::subgraph::npu::ActConverter); diff --git a/lite/kernels/npu/bridges/argmax_op.cc b/lite/kernels/npu/bridges/argmax_op.cc old mode 100755 new mode 100644 index 835d4dd1ed..3d397aab9d --- a/lite/kernels/npu/bridges/argmax_op.cc +++ b/lite/kernels/npu/bridges/argmax_op.cc @@ -44,20 +44,21 @@ int ArgmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { int axis = op_info->GetAttr("axis"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Axis node - auto axis_const_node = graph->AddNode(out_name + "/axis", axis); + auto axis_node = graph->Add(out_name + "/axis", axis); // Argmax node - auto argmax_node = graph->AddNode(out_name); - argmax_node->set_input_x1(*x_node); - argmax_node->set_input_x2(*axis_const_node); + auto argmax_node = graph->Add(out_name); + auto argmax_op = argmax_node->data(); + argmax_op->set_input_x1(*x_node->data()); + 
argmax_op->set_input_x2(*axis_node->data()); return SUCCESS; } @@ -66,6 +67,6 @@ int ArgmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - arg_max, +REGISTER_SUBGRAPH_BRIDGE(arg_max, + kNPU, paddle::lite::subgraph::npu::ArgmaxConverter); diff --git a/lite/kernels/npu/bridges/argmax_op_test.cc b/lite/kernels/npu/bridges/argmax_op_test.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/npu/bridges/batch_norm_op.cc b/lite/kernels/npu/bridges/batch_norm_op.cc index 57b52cf745..d151fd8d7b 100644 --- a/lite/kernels/npu/bridges/batch_norm_op.cc +++ b/lite/kernels/npu/bridges/batch_norm_op.cc @@ -67,30 +67,31 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { bool use_global_stats = op_info->GetAttr("use_global_stats"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Scale, Bias, Mean, Variance node - auto scale_const_node = graph->AddNode(scale_name, *scale); - auto bias_const_node = graph->AddNode(bias_name, *bias); - auto mean_const_node = graph->AddNode(mean_name, *mean); - auto variance_const_node = graph->AddNode(variance_name, *variance); + auto scale_node = graph->Add(scale_name, *scale); + auto bias_node = graph->Add(bias_name, *bias); + auto mean_node = graph->Add(mean_name, *mean); + auto variance_node = graph->Add(variance_name, *variance); // Batch Norm node - auto batch_norm_node = graph->AddNode(y_name); - batch_norm_node->set_input_x(*x_node); - batch_norm_node->set_input_scale(*scale_const_node); - batch_norm_node->set_input_offset(*bias_const_node); - batch_norm_node->set_input_mean(*mean_const_node); - batch_norm_node->set_input_variance(*variance_const_node); - batch_norm_node->set_attr_momentum(momentum); - batch_norm_node->set_attr_epsilon(epsilon); - batch_norm_node->set_attr_mode(mode); - batch_norm_node->set_attr_use_global_stats(use_global_stats); + auto batch_norm_node = graph->Add(y_name); + auto batch_norm_op = batch_norm_node->data(); + batch_norm_op->set_input_x(*x_node->data()); + batch_norm_op->set_input_scale(*scale_node->data()); + batch_norm_op->set_input_offset(*bias_node->data()); + batch_norm_op->set_input_mean(*mean_node->data()); + batch_norm_op->set_input_variance(*variance_node->data()); + batch_norm_op->set_attr_momentum(momentum); + batch_norm_op->set_attr_epsilon(epsilon); + batch_norm_op->set_attr_mode(mode); + batch_norm_op->set_attr_use_global_stats(use_global_stats); return SUCCESS; } @@ -99,6 +100,6 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - batch_norm, +REGISTER_SUBGRAPH_BRIDGE(batch_norm, + kNPU, paddle::lite::subgraph::npu::BatchNormConverter); diff --git a/lite/kernels/npu/bridges/batch_norm_op_test.cc b/lite/kernels/npu/bridges/batch_norm_op_test.cc deleted file mode 100644 index 38a876efb7..0000000000 --- a/lite/kernels/npu/bridges/batch_norm_op_test.cc +++ /dev/null @@ -1,168 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/batch_norm_op.h" -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { -namespace bridges { - -template -void batch_norm_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto y = scope->FindVar(op_info->Output("Y").front())->GetMutable(); - auto bias = - scope->FindVar(op_info->Input("Bias").front())->GetMutable(); - auto scale = - scope->FindVar(op_info->Input("Scale").front())->GetMutable(); - auto mean = - scope->FindVar(op_info->Input("Mean").front())->GetMutable(); - auto variance = - scope->FindVar(op_info->Input("Variance").front())->GetMutable(); - - auto x_data = x->data(); - auto y_data = y->mutable_data(); - auto scale_data = scale->mutable_data(); - auto bias_data = bias->mutable_data(); - auto mean_data = mean->mutable_data(); - auto variance_data = variance->mutable_data(); - DDim x_dims = x->dims(); - - float epsilon = op_info->GetAttr("epsilon"); - float momentum = op_info->GetAttr("momentum"); - auto data_layout = op_info->GetAttr("data_layout"); - - bool global_stats = op_info->GetAttr("use_global_stats"); - if (global_stats) { - int64_t outer_size = 0; - int64_t channel_size = 0; - int64_t inner_size = 0; - if (data_layout == "NCHW") { - outer_size = x_dims[0]; - channel_size = x_dims[1]; - inner_size = x_dims.Slice(2, x_dims.size()).production(); - } else { - LOG(FATAL) << "Unknown storage order: " << data_layout; - } - auto x_ptr = x_data; - auto y_ptr = y_data; - for (int o = 0; o < outer_size; o++) { - for (int c = 0; c < channel_size; c++) { - for (int i = 0; i < inner_size; i++) { - dtype norm_x = - (*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon); - *y_ptr = norm_x * scale_data[c] + bias_data[c]; - x_ptr++; - y_ptr++; - } - } - } - } -} - -void test_batch_norm( - int bs, int ic, int ih, int iw, float epsilon, float momentum) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - std::string scale_var_name = "scale"; - std::string bias_var_name = "bias"; - std::string mean_var_name = "mean"; - std::string variance_var_name = "variance"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* scale = scope.Var(scale_var_name)->GetMutable(); - auto* bias = scope.Var(bias_var_name)->GetMutable(); - auto* mean = scope.Var(mean_var_name)->GetMutable(); - auto* variance = scope.Var(variance_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - scale->Resize({ic}); - bias->Resize({ic}); - mean->Resize({ic}); - variance->Resize({ic}); - - // initialize input&output data - FillTensor(x); - FillTensor(scale); - FillTensor(bias); - FillTensor(mean); - // variance > 0 - FillTensor(variance, 1.f, 5.f); 
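The reference implementation in this deleted test is worth keeping on record: it is plain inference-mode batch norm, normalizing each channel with the running statistics and then applying the affine transform. A standalone restatement of the same math (hypothetical helper name; NCHW layout and use_global_stats == true assumed, as in the test):

#include <cmath>

// y = (x - mean[c]) / sqrt(var[c] + eps) * scale[c] + bias[c], per channel c.
// n = batch size, c = channels, hw = spatial size per channel.
void BatchNormRefNCHW(const float* x, float* y,
                      const float* scale, const float* bias,
                      const float* mean, const float* variance,
                      int n, int c, int hw, float epsilon) {
  for (int b = 0; b < n; ++b) {
    for (int ch = 0; ch < c; ++ch) {
      const float inv_std = 1.0f / std::sqrt(variance[ch] + epsilon);
      const int base = (b * c + ch) * hw;
      for (int i = 0; i < hw; ++i) {
        y[base + i] = (x[base + i] - mean[ch]) * inv_std * scale[ch] + bias[ch];
      }
    }
  }
}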
- - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("batch_norm"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetInput("Scale", {scale_var_name}); - opdesc.SetInput("Bias", {bias_var_name}); - opdesc.SetInput("Mean", {mean_var_name}); - opdesc.SetInput("Variance", {variance_var_name}); - opdesc.SetOutput("Y", {out_var_name}); - opdesc.SetAttr("is_test", 1); - opdesc.SetAttr("use_global_stats", true); - opdesc.SetAttr("epsilon", epsilon); - opdesc.SetAttr("momentum", momentum); - opdesc.SetAttr("data_layout", std::string("NCHW")); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - batch_norm_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, batch_norm) { - for (auto bs : {1, 4, 7}) { - for (auto ic : {1, 4, 7}) { - for (auto ih : {1, 4, 7}) { - for (auto iw : {1, 4, 7}) { - for (auto epsilon : {1e-4f, 1e-5f}) { - for (auto momentum : {0.9f, 0.99f}) { - test_batch_norm(bs, ic, ih, iw, epsilon, momentum); - } - } - } - } - } - } -} - -} // namespace bridges -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(batch_norm); -USE_NPU_BRIDGE(batch_norm); diff --git a/lite/kernels/npu/bridges/concat_op.cc b/lite/kernels/npu/bridges/concat_op.cc index 44a2734c89..e40af8703d 100644 --- a/lite/kernels/npu/bridges/concat_op.cc +++ b/lite/kernels/npu/bridges/concat_op.cc @@ -44,21 +44,22 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Traverse all of input nodes which are added into the new created concat // node - auto concat_node = graph->AddNode(out_name); - concat_node->set_attr_axis(axis); - concat_node->set_attr_N(num); - concat_node->create_dynamic_input_x(num); + auto concat_node = graph->Add(out_name); + auto concat_op = concat_node->data(); + concat_op->set_attr_axis(axis); + concat_op->set_attr_N(num); + concat_op->create_dynamic_input_x(num); int idx = 1; for (auto& x_name : x_names) { auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } - concat_node->set_dynamic_input_x(idx, *x_node); + concat_op->set_dynamic_input_x(idx, *x_node->data()); idx++; } return SUCCESS; @@ -69,6 +70,6 @@ int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - concat, +REGISTER_SUBGRAPH_BRIDGE(concat, + kNPU, paddle::lite::subgraph::npu::ConcatConverter); diff --git a/lite/kernels/npu/bridges/conv_op.cc b/lite/kernels/npu/bridges/conv_op.cc index 6b34e76880..60877f768b 100644 --- a/lite/kernels/npu/bridges/conv_op.cc +++ b/lite/kernels/npu/bridges/conv_op.cc @@ -67,11 +67,11 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(dilations.size(), 2L); // Input node - std::shared_ptr input_node = nullptr; - if (graph->HasNode(input_name)) { - input_node = graph->GetNode(input_name); + std::shared_ptr input_node = nullptr; + if 
(graph->Has(input_name)) { + input_node = graph->Get(input_name); } else { - input_node = graph->AddNode(input_name, input_dims); + input_node = graph->Add(input_name, *input); } if (paddings.size() == 2L) { @@ -109,104 +109,102 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // Filter node - auto filter_const_node = graph->AddNode(filter_name, *filter); + auto filter_node = graph->Add(filter_name, *filter); // Add bias node if exists bias // Supports the bias nodes with the following dimensions // 0: {oc} // 1: {1, oc, oh, ow} // 2: {n, oc, oh, ow} - std::shared_ptr bias_node = nullptr; + std::shared_ptr bias_node = nullptr; bool is_channel_bias = false; if (HasInputArg(op_info, scope, "Bias")) { auto bias_name = op_info->Input("Bias").front(); - auto bias_type = kernel->GetInputDeclType("Bias"); - CHECK(bias_type->precision() == PRECISION(kFloat)); - CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); - auto bias = scope->FindMutableTensor(bias_name); - auto bias_dims = bias->dims(); - auto bias_data_size = bias_dims.production(); - auto output_data_size = output_dims.production(); - std::vector bias_shape; - if (bias_data_size == oc) { - // 0: {oc} - bias_shape = {1, oc, 1, 1}; - is_channel_bias = true; - } else if (bias_data_size == output_data_size / bs) { - // 1: {1, oc, oh, ow} - bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]}; - } else if (bias_data_size == output_data_size) { - // 2: {n, oc, oh, ow} - bias_shape = output_dims.Vectorize(); + if (graph->Has(bias_name)) { + bias_node = graph->Get(bias_name); } else { - LOG(WARNING) << "[NPU] Bias dimension " << bias_dims - << " isn't supported in conv2d Op when output dimension is " - << output_dims; - return FAILED; - } - if (graph->HasNode(bias_name)) { - // Bias node from input node - bias_node = graph->GetNode(bias_name); - } else { - // Bias node with const data - bias_node = graph->AddNode(bias_name, *bias, bias_shape); + auto bias_type = kernel->GetInputDeclType("Bias"); + CHECK(bias_type->precision() == PRECISION(kFloat)); + CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + auto bias_data_size = bias_dims.production(); + auto output_data_size = output_dims.production(); + std::vector bias_shape; + if (bias_data_size == oc) { + // 0: {oc} + bias_shape = {1, oc, 1, 1}; + is_channel_bias = true; + } else if (bias_data_size == output_data_size / bs) { + // 1: {1, oc, oh, ow} + bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]}; + } else if (bias_data_size == output_data_size) { + // 2: {n, oc, oh, ow} + bias_shape = output_dims.Vectorize(); + } else { + LOG(WARNING) + << "[NPU] Bias dimension " << bias_dims + << " isn't supported in conv2d Op when output dimension is " + << output_dims; + return FAILED; + } + bias_node = graph->Add(bias_name, *bias, bias_shape); } } // Conv node - std::shared_ptr conv_node = nullptr; + std::shared_ptr conv_node = nullptr; if (use_depthwise_conv && is_depthwise_mode) { - auto depthwise_conv_node = - graph->AddNode(output_name); - depthwise_conv_node->set_input_x(*input_node); - depthwise_conv_node->set_input_filter(*filter_const_node); - depthwise_conv_node->set_attr_mode(1); - depthwise_conv_node->set_attr_algo(0); - depthwise_conv_node->set_attr_format(0); // NCHW - depthwise_conv_node->set_attr_pad_mode(5); // VALID - depthwise_conv_node->set_attr_group(groups); - depthwise_conv_node->set_attr_pad(ge::AttrValue::LIST_INT( + conv_node = graph->Add(output_name); + 
auto conv_op = conv_node->data(); + conv_op->set_input_x(*input_node->data()); + conv_op->set_input_filter(*filter_node->data()); + conv_op->set_attr_mode(1); + conv_op->set_attr_algo(0); + conv_op->set_attr_format(0); // NCHW + conv_op->set_attr_pad_mode(5); // VALID + conv_op->set_attr_group(groups); + conv_op->set_attr_pad(ge::AttrValue::LIST_INT( {paddings[0], paddings[1], paddings[2], paddings[3]})); - depthwise_conv_node->set_attr_dilation( + conv_op->set_attr_dilation( ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); - depthwise_conv_node->set_attr_stride( - ge::AttrValue::LIST_INT({strides[0], strides[1]})); - depthwise_conv_node->set_attr_kernel( + conv_op->set_attr_stride(ge::AttrValue::LIST_INT({strides[0], strides[1]})); + conv_op->set_attr_kernel( ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]})); - conv_node = depthwise_conv_node; // ConvolutionDepthwise Op doesn't support bias, so append Add node to // support bias if (bias_node != nullptr) { - auto add_node = graph->AddNode(output_name); - add_node->set_input_x1(*depthwise_conv_node); - add_node->set_input_x2(*bias_node); + auto add_node = graph->Add(output_name); + auto add_op = add_node->data(); + add_op->set_input_x1(*conv_node->data()); + add_op->set_input_x2(*bias_node->data()); conv_node = add_node; } } else { - auto common_conv_node = graph->AddNode(output_name); - common_conv_node->set_input_x(*input_node); - common_conv_node->set_input_w(*filter_const_node); - common_conv_node->set_attr_mode(1); - common_conv_node->set_attr_pad_mode(0); // NOTSET - common_conv_node->set_attr_group(groups); - common_conv_node->set_attr_pad(ge::AttrValue::LIST_INT( + conv_node = graph->Add(output_name); + auto conv_op = conv_node->data(); + conv_op->set_input_x(*input_node->data()); + conv_op->set_input_w(*filter_node->data()); + conv_op->set_attr_mode(1); + conv_op->set_attr_pad_mode(0); // NOTSET + conv_op->set_attr_group(groups); + conv_op->set_attr_pad(ge::AttrValue::LIST_INT( {paddings[0], paddings[0], paddings[2], paddings[2]})); - common_conv_node->set_attr_dilation( + conv_op->set_attr_dilation( ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); - common_conv_node->set_attr_stride( - ge::AttrValue::LIST_INT({strides[0], strides[1]})); - common_conv_node->set_attr_kernel( + conv_op->set_attr_stride(ge::AttrValue::LIST_INT({strides[0], strides[1]})); + conv_op->set_attr_kernel( ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]})); - conv_node = common_conv_node; // Convolution Op only support bias with dimension {1, oc, 1, 1}, // so append Add node if dimension is {1, oc, oh, ow} or (n, oc, oh, ow) if (bias_node != nullptr) { if (is_channel_bias) { - common_conv_node->set_input_b(*bias_node); + conv_op->set_input_b(*bias_node->data()); } else { - auto add_node = graph->AddNode(output_name); - add_node->set_input_x1(*common_conv_node); - add_node->set_input_x2(*bias_node); + auto add_node = graph->Add(output_name); + auto add_op = add_node->data(); + add_op->set_input_x1(*conv_node->data()); + add_op->set_input_x2(*bias_node->data()); conv_node = add_node; } } @@ -215,9 +213,10 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { if (fuse_relu) { // Append relu node if fuse_relu is true - auto relu_node = graph->AddNode(output_name); - relu_node->set_input_x(*conv_node); - relu_node->set_attr_mode(CvtActMode("relu")); + auto relu_node = graph->Add(output_name); + auto relu_op = relu_node->data(); + relu_op->set_input_x(*conv_node->data()); + relu_op->set_attr_mode(CvtActMode("relu")); } 
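The bias handling just above accepts three bias shapes and chooses between binding the bias directly to the Convolution op (the per-channel case) and appending an explicit Add node (the element-wise cases). The same dispatch, restated as a hypothetical standalone helper so the three cases are easy to see:

#include <cstdint>

enum class BiasKind { kPerChannel, kPerImage, kPerBatch, kUnsupported };

// bias_size is bias->dims().production(); output_size and batch come from the
// conv output dims. The order mirrors the converter: the {oc} case wins when
// sizes coincide, since only it can feed Convolution's b input directly.
BiasKind ClassifyBias(int64_t bias_size, int64_t oc,
                      int64_t output_size, int64_t batch) {
  if (bias_size == oc) return BiasKind::kPerChannel;                 // {oc}
  if (bias_size == output_size / batch) return BiasKind::kPerImage;  // {1, oc, oh, ow}
  if (bias_size == output_size) return BiasKind::kPerBatch;          // {n, oc, oh, ow}
  return BiasKind::kUnsupported;
}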
return REBUILD_WHEN_SHAPE_CHANGED; } @@ -227,9 +226,9 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - conv2d, +REGISTER_SUBGRAPH_BRIDGE(conv2d, + kNPU, paddle::lite::subgraph::npu::ConvConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - depthwise_conv2d, +REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d, + kNPU, paddle::lite::subgraph::npu::ConvConverter); diff --git a/lite/kernels/npu/bridges/conv_transpose_op.cc b/lite/kernels/npu/bridges/conv_transpose_op.cc index 5ac0723c78..ab31a920ec 100644 --- a/lite/kernels/npu/bridges/conv_transpose_op.cc +++ b/lite/kernels/npu/bridges/conv_transpose_op.cc @@ -58,11 +58,11 @@ int ConvTransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(dilations.size(), 2L); // Input node - std::shared_ptr input_node = nullptr; - if (graph->HasNode(input_name)) { - input_node = graph->GetNode(input_name); + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + input_node = graph->Get(input_name); } else { - input_node = graph->AddNode(input_name, input_dims); + input_node = graph->Add(input_name, *input); } // Create input sizes node to describe the dimensions of input tensor @@ -83,55 +83,59 @@ int ConvTransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { (input_dims[i + 2] - 1) * strides[i] + kernel_ext - 2 * paddings[i]; input_sizes.push_back(output_size); } - auto input_sizes_const_node = - graph->AddNode(output_name + "/input_sizes", input_sizes); + auto input_sizes_node = graph->Add(output_name + "/input_sizes", input_sizes); // Filter node - auto filter_const_node = graph->AddNode(filter_name, *filter); + auto filter_node = graph->Add(filter_name, *filter); // Deconv node - auto conv_transpose_node = graph->AddNode(output_name); - conv_transpose_node->set_input_input_sizes(*input_sizes_const_node); - conv_transpose_node->set_input_filter(*filter_const_node); - conv_transpose_node->set_input_x(*input_node); + auto conv_transpose_node = graph->Add(output_name); + auto conv_transpose_op = conv_transpose_node->data(); + conv_transpose_op->set_input_input_sizes(*input_sizes_node->data()); + conv_transpose_op->set_input_filter(*filter_node->data()); + conv_transpose_op->set_input_x(*input_node->data()); // Set attributes - conv_transpose_node->set_attr_format(0); // NCHW - conv_transpose_node->set_attr_pad_mode(0); // NOTSET - conv_transpose_node->set_attr_group(groups); - conv_transpose_node->set_attr_pad(ge::AttrValue::LIST_INT( + conv_transpose_op->set_attr_format(0); // NCHW + conv_transpose_op->set_attr_pad_mode(0); // NOTSET + conv_transpose_op->set_attr_group(groups); + conv_transpose_op->set_attr_pad(ge::AttrValue::LIST_INT( {paddings[0], paddings[1], paddings[2], paddings[3]})); - conv_transpose_node->set_attr_dilation( + conv_transpose_op->set_attr_dilation( ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); - conv_transpose_node->set_attr_stride( + conv_transpose_op->set_attr_stride( ge::AttrValue::LIST_INT({strides[0], strides[1]})); - conv_transpose_node->set_attr_kernel( + conv_transpose_op->set_attr_kernel( ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]})); // Append add node to add bias if exists bias - std::shared_ptr output_node = conv_transpose_node; if (HasInputArg(op_info, scope, "Bias")) { - // Create bias node + std::shared_ptr bias_node = nullptr; auto bias_name = op_info->Input("Bias").front(); - auto bias_type = kernel->GetInputDeclType("Bias"); - CHECK(bias_type->precision() == PRECISION(kFloat)); 
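A side note on the input_sizes values computed at the top of this conv_transpose hunk: they follow the standard transposed-convolution output-size formula, out = (in - 1) * stride + kernel_ext - 2 * pad, where kernel_ext is the dilation-aware kernel extent. A standalone sketch (hypothetical helper name; symmetric per-dimension padding assumed, as in the converter):

#include <cstdint>
#include <vector>

// kernel_ext = dilation * (k - 1) + 1
// out        = (in - 1) * stride + kernel_ext - 2 * pad
std::vector<int64_t> DeconvOutputSize(const std::vector<int64_t>& in_hw,
                                      const std::vector<int64_t>& k_hw,
                                      const std::vector<int64_t>& strides,
                                      const std::vector<int64_t>& dilations,
                                      const std::vector<int64_t>& paddings) {
  std::vector<int64_t> out;
  for (size_t i = 0; i < in_hw.size(); ++i) {
    const int64_t kernel_ext = dilations[i] * (k_hw[i] - 1) + 1;
    out.push_back((in_hw[i] - 1) * strides[i] + kernel_ext - 2 * paddings[i]);
  }
  return out;
}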
- CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); - auto bias = scope->FindMutableTensor(bias_name); - auto channel_size = bias->dims().production(); - CHECK_EQ(channel_size, filter_dims[1] * groups); - auto bias_const_node = - graph->AddNode(bias_name, *bias, {1, channel_size, 1, 1}); + if (graph->Has(bias_name)) { + bias_node = graph->Get(bias_name); + } else { + auto bias_type = kernel->GetInputDeclType("Bias"); + CHECK(bias_type->precision() == PRECISION(kFloat)); + CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); + auto bias = scope->FindMutableTensor(bias_name); + auto channel_size = bias->dims().production(); + CHECK_EQ(channel_size, filter_dims[1] * groups); + bias_node = graph->Add(bias_name, *bias, {1, channel_size, 1, 1}); + } // Append add node to add bias node - auto add_node = graph->AddNode(output_name); - add_node->set_input_x1(*conv_transpose_node); - add_node->set_input_x2(*bias_const_node); - output_node = add_node; + auto add_node = graph->Add(output_name); + auto add_op = add_node->data(); + add_op->set_input_x1(*conv_transpose_node->data()); + add_op->set_input_x2(*bias_node->data()); + conv_transpose_node = add_node; } if (fuse_relu) { // Append relu node if fuse_relu is true - auto relu_node = graph->AddNode(output_name); - relu_node->set_input_x(*output_node); - relu_node->set_attr_mode(CvtActMode("relu")); + auto relu_node = graph->Add(output_name); + auto relu_op = relu_node->data(); + relu_op->set_input_x(*conv_transpose_node->data()); + relu_op->set_attr_mode(CvtActMode("relu")); } return REBUILD_WHEN_SHAPE_CHANGED; } @@ -141,6 +145,6 @@ int ConvTransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - conv2d_transpose, +REGISTER_SUBGRAPH_BRIDGE(conv2d_transpose, + kNPU, paddle::lite::subgraph::npu::ConvTransposeConverter); diff --git a/lite/kernels/npu/bridges/elementwise_ops.cc b/lite/kernels/npu/bridges/elementwise_ops.cc index a31a1426dc..69b77b5def 100644 --- a/lite/kernels/npu/bridges/elementwise_ops.cc +++ b/lite/kernels/npu/bridges/elementwise_ops.cc @@ -74,45 +74,45 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto axis = op_info->GetAttr("axis"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Y node - std::shared_ptr y_node = nullptr; - if (graph->HasNode(y_name)) { - y_node = graph->GetNode(y_name); + std::shared_ptr y_node = nullptr; + if (graph->Has(y_name)) { + y_node = graph->Get(y_name); } else { auto y_new_shape = CvtYShape(x_dims, y_dims, axis); - y_node = graph->AddNode(y_name, y_new_shape); + y_node = graph->Add(y_name, *y, y_new_shape); } // Elementwise node - std::shared_ptr elementwise_node = nullptr; + std::shared_ptr elt_node = nullptr; if (op_type == "elementwise_add" || op_type == "fusion_elementwise_add_activation") { - auto elt_node = graph->AddNode(out_name); - elt_node->set_input_x1(*x_node); - elt_node->set_input_x2(*y_node); - elementwise_node = elt_node; + elt_node = graph->Add(out_name); + auto elt_op = elt_node->data(); + elt_op->set_input_x1(*x_node->data()); + elt_op->set_input_x2(*y_node->data()); } else if (op_type == "elementwise_sub") { - auto elt_node = graph->AddNode(out_name); - elt_node->set_input_x1(*x_node); - elt_node->set_input_x2(*y_node); - 
elementwise_node = elt_node; + elt_node = graph->Add(out_name); + auto elt_op = elt_node->data(); + elt_op->set_input_x1(*x_node->data()); + elt_op->set_input_x2(*y_node->data()); } else if (op_type == "elementwise_mul") { - auto elt_node = graph->AddNode(out_name); - elt_node->set_input_x(*x_node); - elt_node->set_input_y(*y_node); - elementwise_node = elt_node; + elt_node = graph->Add(out_name); + auto elt_op = elt_node->data(); + elt_op->set_input_x(*x_node->data()); + elt_op->set_input_y(*y_node->data()); } else if (op_type == "elementwise_div") { - auto elt_node = graph->AddNode(out_name); - elt_node->set_input_x1(*x_node); - elt_node->set_input_x2(*y_node); - elementwise_node = elt_node; + elt_node = graph->Add(out_name); + auto elt_op = elt_node->data(); + elt_op->set_input_x1(*x_node->data()); + elt_op->set_input_x2(*y_node->data()); } else { LOG(WARNING) << "[NPU] Unsupported op type: " << op_type; return FAILED; @@ -121,11 +121,12 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Act node if (op_type == "fusion_elementwise_add_activation") { auto act_type = op_info->GetAttr("act_type"); - auto act_node = graph->AddNode(out_name); - act_node->set_input_x(*elementwise_node); + auto act_node = graph->Add(out_name); + auto act_op = act_node->data(); + act_op->set_input_x(*elt_node->data()); // TODO(hong19860320) set the coef value for act Ops, such as leaky_relu, // clipped_relu etc. - act_node->set_attr_mode(CvtActMode(act_type)); + act_op->set_attr_mode(CvtActMode(act_type)); } return REBUILD_WHEN_SHAPE_CHANGED; } @@ -135,18 +136,18 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - elementwise_add, +REGISTER_SUBGRAPH_BRIDGE(elementwise_add, + kNPU, paddle::lite::subgraph::npu::ElementwiseConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - fusion_elementwise_add_activation, +REGISTER_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, + kNPU, paddle::lite::subgraph::npu::ElementwiseConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - elementwise_sub, +REGISTER_SUBGRAPH_BRIDGE(elementwise_sub, + kNPU, paddle::lite::subgraph::npu::ElementwiseConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - elementwise_mul, +REGISTER_SUBGRAPH_BRIDGE(elementwise_mul, + kNPU, paddle::lite::subgraph::npu::ElementwiseConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - elementwise_div, +REGISTER_SUBGRAPH_BRIDGE(elementwise_div, + kNPU, paddle::lite::subgraph::npu::ElementwiseConverter); diff --git a/lite/kernels/npu/bridges/engine.cc b/lite/kernels/npu/bridges/engine.cc old mode 100755 new mode 100644 index e7e35831dd..546a235148 --- a/lite/kernels/npu/bridges/engine.cc +++ b/lite/kernels/npu/bridges/engine.cc @@ -57,9 +57,11 @@ int Engine::BuildOriginProgram() { VLOG(3) << "The attr '" << kKernelTypeAttr << "' not found, pick the first kernel for " << op_type; #if defined(LITE_WITH_ARM) - auto kernels = op->CreateKernels({Place{TARGET(kARM)}}); + auto kernels = + op->CreateKernels({Place{TARGET(kARM)}, Place{TARGET(kHost)}}); #elif defined(LITE_WITH_X86) - auto kernels = op->CreateKernels({Place{TARGET(kX86)}}); + auto kernels = + op->CreateKernels({Place{TARGET(kX86)}, Place{TARGET(kHost)}}); #endif CHECK_GT(kernels.size(), 0) << "No kernels found for " << op_type; picked_kernel = std::move(kernels.front()); diff --git a/lite/kernels/npu/bridges/engine.h b/lite/kernels/npu/bridges/engine.h old mode 100755 new mode 100644 diff --git a/lite/kernels/npu/bridges/fc_op.cc b/lite/kernels/npu/bridges/fc_op.cc index 
7b66d54565..3d02817215 100644 --- a/lite/kernels/npu/bridges/fc_op.cc +++ b/lite/kernels/npu/bridges/fc_op.cc @@ -57,22 +57,24 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { << " m: " << m << " k: " << k << " n: " << n; // Create input node and reshape it to (m, k, 1, 1) - std::shared_ptr input_node = nullptr; - if (graph->HasNode(input_name)) { - input_node = graph->GetNode(input_name); + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + input_node = graph->Get(input_name); } else { - input_node = graph->AddNode(input_name, input_dims); + input_node = graph->Add(input_name, *input); } auto reshaped_input_node = - graph->AddNode(input_name + "/reshape"); - reshaped_input_node->set_input_tensor(*input_node); - reshaped_input_node->set_attr_shape({m, k, 1, 1}); - reshaped_input_node->set_attr_axis(0); + graph->Add(input_name + "/reshape"); + auto reshaped_input_op = reshaped_input_node->data(); + reshaped_input_op->set_input_tensor(*input_node->data()); + reshaped_input_op->set_attr_shape({m, k, 1, 1}); + reshaped_input_op->set_attr_axis(0); // Create w const node, set its shape to (n, k, 1, 1) and fill with // the transposed w tensor Tensor transpose_w; transpose_w.Resize({n, k, 1, 1}); + transpose_w.set_persistable(true); auto transpose_w_data = transpose_w.mutable_data(); auto w_data = w->mutable_data(); for (int i = 0; i < k; i++) { @@ -80,29 +82,36 @@ int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { transpose_w_data[j * k + i] = w_data[i * n + j]; } } - auto trans_w_const_node = graph->AddNode(w_name, transpose_w); + auto trans_w_node = graph->Add(w_name, transpose_w); // FC node - auto fc_node = graph->AddNode(out_name + "/fc"); - fc_node->set_input_x(*reshaped_input_node); - fc_node->set_input_w(*trans_w_const_node); + auto fc_node = graph->Add(out_name + "/fc"); + auto fc_op = fc_node->data(); + fc_op->set_input_x(*reshaped_input_node->data()); + fc_op->set_input_w(*trans_w_node->data()); // Add bias node if bias tensor exists if (HasInputArg(op_info, scope, "Bias")) { + std::shared_ptr bias_node = nullptr; auto bias_name = op_info->Input("Bias").front(); - auto bias_type = kernel->GetInputDeclType("Bias"); - CHECK(bias_type->precision() == PRECISION(kFloat)); - CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); - auto bias = scope->FindMutableTensor(bias_name); - auto bias_dims = bias->dims(); - CHECK_EQ(bias_dims.production(), n); - auto bias_const_node = graph->AddNode(bias_name, *bias, {1, n, 1, 1}); - fc_node->set_input_b(*bias_const_node); + if (graph->Has(bias_name)) { + bias_node = graph->Get(bias_name); + } else { + auto bias_type = kernel->GetInputDeclType("Bias"); + CHECK(bias_type->precision() == PRECISION(kFloat)); + CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + CHECK_EQ(bias_dims.production(), n); + bias_node = graph->Add(bias_name, *bias, {1, n, 1, 1}); + } + fc_op->set_input_b(*bias_node->data()); } // Reshape output of FC node from (m, n, 1, 1) to (m, n) - auto reshaped_fc_node = graph->AddNode(out_name); - reshaped_fc_node->set_input_tensor(*fc_node); - reshaped_fc_node->set_attr_shape({m, n}); - reshaped_fc_node->set_attr_axis(0); + auto reshaped_fc_node = graph->Add(out_name); + auto reshaped_fc_op = reshaped_fc_node->data(); + reshaped_fc_op->set_input_tensor(*fc_node->data()); + reshaped_fc_op->set_attr_shape({m, n}); + reshaped_fc_op->set_attr_axis(0); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -111,4 +120,4 @@ int 
FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, fc, paddle::lite::subgraph::npu::FCConverter); +REGISTER_SUBGRAPH_BRIDGE(fc, kNPU, paddle::lite::subgraph::npu::FCConverter); diff --git a/lite/kernels/npu/bridges/graph.cc b/lite/kernels/npu/bridges/graph.cc old mode 100755 new mode 100644 index 48ebfd5673..7d3afd92bf --- a/lite/kernels/npu/bridges/graph.cc +++ b/lite/kernels/npu/bridges/graph.cc @@ -21,26 +21,52 @@ namespace lite { namespace subgraph { namespace npu { -// Const node -std::shared_ptr Graph::AddNode(const std::string& name, - const Tensor& tensor, - std::vector shape, - PrecisionType precision, - DataLayoutType layout) { - auto node = AddNode(name, precision, layout); - node->set_attr_value(CvtTensor(tensor, shape, precision, layout)); +int Graph::Add(const std::string& name, std::shared_ptr node) { + auto it = nodes_.find(name); + if (it != nodes_.end()) { + // Only variable node can be shared with the same name + if (!node->is_var() || !it->second.back()->is_var()) { + LOG(FATAL) << "[NPU] Const or data node " << name << " is redefined."; + return -1; + } + } else { + auto ret = nodes_.insert( + std::make_pair(name, std::vector>())); + CHECK(ret.second); + it = ret.first; + } + it->second.push_back(node); + return it->second.size(); +} + +// Const or data node +std::shared_ptr Graph::Add(const std::string& name, + const Tensor& tensor, + std::vector shape, + DataLayoutType layout) { + std::shared_ptr node = nullptr; + PrecisionType precision = tensor.precision(); + if (tensor.persistable()) { + // Const node + node = Add(name, precision, layout); + node->data()->set_attr_value( + CvtTensor(tensor, shape, layout)); + } else { + // Data node + node = Add(name, shape, precision, layout); + } return node; } // Data node -std::shared_ptr Graph::AddNode(const std::string& name, - std::vector shape, - PrecisionType precision, - DataLayoutType layout) { - auto node = AddNode(name); +std::shared_ptr Graph::Add(const std::string& name, + std::vector shape, + PrecisionType precision, + DataLayoutType layout) { + auto node = Add(name, precision, layout); ge::TensorDesc desc( ge::Shape(shape), CvtDataLayoutType(layout), CvtPrecisionType(precision)); - node->update_input_desc_x(desc); + node->data()->update_input_desc_x(desc); return node; } diff --git a/lite/kernels/npu/bridges/graph.h b/lite/kernels/npu/bridges/graph.h old mode 100755 new mode 100644 index 9b6e49c5e9..cc4a7e2a7c --- a/lite/kernels/npu/bridges/graph.h +++ b/lite/kernels/npu/bridges/graph.h @@ -19,7 +19,7 @@ #include #include #include -#include "ai_ddk_lib/include/graph/op/all_ops.h" +#include "graph/op/all_ops.h" #include "lite/core/op_lite.h" #include "lite/core/tensor.h" @@ -28,105 +28,94 @@ namespace lite { namespace subgraph { namespace npu { -// Type of graph nodes -class Type { +// Graph and node is defined to collect all of converted HiAI IR nodes +class Node { public: - Type(PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW), - bool persistable = false) - : precision_(precision), layout_(layout), persistable_(persistable) {} - + enum class Role { + kVar = 0, + kConst, + kData, + }; + + Node(std::shared_ptr data, + PrecisionType precision, + DataLayoutType layout, + Role role) + : data_(data), precision_(precision), layout_(layout), role_(role) {} + Node(PrecisionType precision, DataLayoutType layout, Role role) + : precision_(precision), layout_(layout), role_(role) {} + + void 
set_data(std::shared_ptr data) { data_ = data; } void set_precision(PrecisionType precision) { precision_ = precision; } void set_layout(DataLayoutType layout) { layout_ = layout; } - bool set_persistable(bool persistable) { persistable_ = persistable; } + void set_role(Role role) { role_ = role; } + template + std::shared_ptr data() { + return std::static_pointer_cast(data_); + } + std::shared_ptr data() { return data_; } PrecisionType precision() const { return precision_; } DataLayoutType layout() const { return layout_; } - bool persistable() const { return persistable_; } + bool is_var() const { return role_ == Role::kVar; } + bool is_const() const { return role_ == Role::kConst; } + bool is_data() const { return role_ == Role::kData; } private: + std::shared_ptr data_{nullptr}; PrecisionType precision_{PRECISION(kFloat)}; DataLayoutType layout_{DATALAYOUT(kNCHW)}; - bool persistable_{false}; + Role role_{Role::kVar}; }; -// Graph to collect all of converted HiAI IR nodes class Graph { public: + int Add(const std::string& name, std::shared_ptr node); + + // Variable, const or data node template - std::shared_ptr AddNode(const std::string& name, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)) { - auto unique_name = [&](const std::string& key) { - int idx = 1; - auto it = counts_.find(key); - if (it == counts_.end()) { - counts_.insert(std::make_pair(key, idx)); - } else { - idx = ++(it->second); - } - return key + "_" + std::to_string(idx); - }; - bool persistable = typeid(T) == typeid(ge::op::Const); - auto it = nodes_.find(name); - if (it != nodes_.end()) { - // Only variable can rebind the name - CHECK(!it->second.second.persistable() && !persistable) - << "[NPU] Node " << name << " redefined."; - // Generate a new unique name as the key to bind the origin node: - // new_name->node - nodes_.insert(std::make_pair(unique_name(name + "_var"), it->second)); - nodes_.erase(it); + std::shared_ptr Add(const std::string& name, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)) { + Node::Role role = Node::Role::kVar; + if (typeid(T) == typeid(ge::op::Const)) { + role = Node::Role::kConst; + } else if (typeid(T) == typeid(ge::op::Data)) { + role = Node::Role::kData; } - // Create a new node and bind with the name: name->new_node - auto node = std::make_shared(unique_name(name + "_op")); - nodes_.insert(std::make_pair( - name, std::make_pair(node, Type(precision, layout, persistable)))); + auto node = std::make_shared(precision, layout, role); + auto idx = Add(name, node); + CHECK_GE(idx, 1); + // Generate a unique name for the created HiAI IR + node->set_data(std::make_shared(name + "__" + std::to_string(idx))); return node; } - // Const node - std::shared_ptr AddNode( - const std::string& name, - const Tensor& tensor, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, tensor, tensor.dims().Vectorize(), precision, layout); + // Const or data node + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + std::vector shape, + DataLayoutType layout = DATALAYOUT(kNCHW)); + + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, tensor, tensor.dims().Vectorize(), layout); } - std::shared_ptr AddNode( - const std::string& name, - const Tensor& tensor, - std::vector shape, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = 
DATALAYOUT(kNCHW)); - - std::shared_ptr AddNode( - const std::string& name, - const Tensor& tensor, - DDim dims, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, tensor, dims.Vectorize(), precision, layout); + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, tensor, dims.Vectorize(), layout); } + // Const node template - std::shared_ptr AddNode( - const std::string& name, - const std::vector& data, - std::vector shape = {}, - DataLayoutType layout = DATALAYOUT(kNCHW)) { - const std::type_info& info = typeid(T); - PrecisionType precision = PRECISION(kFloat); - if (info == typeid(float)) { - precision = PRECISION(kFloat); - } else if (info == typeid(int8_t)) { - precision = PRECISION(kFloat); - } else if (info == typeid(int32_t)) { - precision = PRECISION(kInt32); - } else { - LOG(FATAL) << "[NPU] Unknow data type " << info.name(); - } + std::shared_ptr Add(const std::string& name, + const std::vector& data, + std::vector shape = {}, + DataLayoutType layout = DATALAYOUT(kNCHW)) { if (shape.empty()) { shape = {static_cast(data.size())}; } else { @@ -138,78 +127,66 @@ class Graph { } Tensor tensor; tensor.Resize(shape); + tensor.set_persistable(true); std::memcpy(reinterpret_cast(tensor.mutable_data()), reinterpret_cast(data.data()), data.size() * sizeof(T)); - return AddNode(name, tensor, precision, layout); + return Add(name, tensor, layout); } template - std::shared_ptr AddNode( - const std::string& name, - const std::vector& data, - DDim dims, - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, data, dims.Vectorize(), layout); + std::shared_ptr Add(const std::string& name, + const std::vector& data, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, data, dims.Vectorize(), layout); } template - std::shared_ptr AddNode( - const std::string& name, - T value, - std::vector shape = {1}, - DataLayoutType layout = DATALAYOUT(kNCHW)) { + std::shared_ptr Add(const std::string& name, + T value, + std::vector shape = {1}, + DataLayoutType layout = DATALAYOUT(kNCHW)) { int64_t size = 1; for (auto i : shape) { size *= i; } std::vector data(size, value); - return AddNode(name, data, shape, layout); + return Add(name, data, shape, layout); } template - std::shared_ptr AddNode( - const std::string& name, - T value, - DDim dims, - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, value, dims.Vectorize(), layout); + std::shared_ptr Add(const std::string& name, + T value, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, value, dims.Vectorize(), layout); } // Data node - std::shared_ptr AddNode( - const std::string& name, - std::vector shape, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)); - - std::shared_ptr AddNode( - const std::string& name, - DDim dims, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, dims.Vectorize(), precision, layout); - } - - std::shared_ptr GetNode(std::string name) { - CHECK(HasNode(name)) << "[NPU] Node " << name << " not found."; - return nodes_.at(name).first; + std::shared_ptr Add(const std::string& name, + std::vector shape, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)); + + std::shared_ptr Add(const std::string& name, + DDim dims, + PrecisionType precision = 
PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, dims.Vectorize(), precision, layout); } - const Type& GetType(const std::string& name) { - CHECK(HasNode(name)) << "[NPU] Node " << name << " not found."; - return nodes_.at(name).second; + std::shared_ptr Get(std::string name) { + CHECK(Has(name)) << "[NPU] Node " << name << " not found."; + return nodes_.at(name).back(); } - bool HasNode(const std::string& name) { + bool Has(const std::string& name) { return nodes_.find(name) != nodes_.end(); } private: - std::unordered_map, Type>> - nodes_; - std::unordered_map counts_; + std::unordered_map>> nodes_; }; } // namespace npu diff --git a/lite/kernels/npu/bridges/interpolate_op.cc b/lite/kernels/npu/bridges/interpolate_op.cc index f95ebc347a..238200abf3 100644 --- a/lite/kernels/npu/bridges/interpolate_op.cc +++ b/lite/kernels/npu/bridges/interpolate_op.cc @@ -55,11 +55,11 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { "supported in HiAI DDK"; // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Priority: OutSize > scale > out_h/out_w @@ -71,17 +71,18 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // Update out_h and out_w and create out_size node if has OutSize - std::shared_ptr out_size_node = nullptr; + std::shared_ptr out_size_node = nullptr; if (HasInputArg(op_info, scope, "OutSize")) { auto out_size_name = op_info->Input("OutSize").front(); auto out_size_type = kernel->GetInputDeclType("OutSize"); CHECK(out_size_type->precision() == PRECISION(kInt32)); CHECK(out_size_type->layout() == DATALAYOUT(kNCHW)); - if (graph->HasNode(out_size_name)) { - out_size_node = graph->GetNode(out_size_name); + if (graph->Has(out_size_name)) { + out_size_node = graph->Get(out_size_name); } else { auto out_size = scope->FindMutableTensor(out_size_name); CHECK_EQ(out_size->numel(), 2); + CHECK(out_size->persistable()); auto out_size_data = out_size->mutable_data(); // Update out_h and out_w if has OutSize out_h = out_size_data[0]; @@ -97,22 +98,25 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { << " is too large, should not exceed " << largest_multiple << " in HiAI DDK"; } - out_size_node = graph->AddNode(out_name + "/out_size", - std::vector({out_h, out_w})); + out_size_node = + graph->Add(out_name + "/out_size", std::vector({out_h, out_w})); } if (interp_method == "bilinear") { - auto bilinear_interp_node = - graph->AddNode(out_name); - bilinear_interp_node->set_input_x(*x_node); - bilinear_interp_node->set_input_size(*out_size_node); - bilinear_interp_node->set_attr_align_corners(align_corners); + auto bilinear_interp_node = graph->Add(out_name); + auto bilinear_interp_op = + bilinear_interp_node->data(); + bilinear_interp_op->set_input_x(*x_node->data()); + bilinear_interp_op->set_input_size(*out_size_node->data()); + bilinear_interp_op->set_attr_align_corners(align_corners); } else if (interp_method == "nearest") { auto nearest_interp_node = - graph->AddNode(out_name); - nearest_interp_node->set_input_image(*x_node); - nearest_interp_node->set_input_size(*out_size_node); - nearest_interp_node->set_attr_align_corners(align_corners); + graph->Add(out_name); + auto nearest_interp_op = + nearest_interp_node->data(); + 
nearest_interp_op->set_input_image(*x_node->data()); + nearest_interp_op->set_input_size(*out_size_node->data()); + nearest_interp_op->set_attr_align_corners(align_corners); } else { LOG(WARNING) << "[NPU] Unsupported interpolate method: " << interp_method; return FAILED; @@ -125,9 +129,9 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - bilinear_interp, +REGISTER_SUBGRAPH_BRIDGE(bilinear_interp, + kNPU, paddle::lite::subgraph::npu::InterpolateConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - nearest_interp, +REGISTER_SUBGRAPH_BRIDGE(nearest_interp, + kNPU, paddle::lite::subgraph::npu::InterpolateConverter); diff --git a/lite/kernels/npu/bridges/mul_op.cc b/lite/kernels/npu/bridges/mul_op.cc index f63b6826b9..27df458195 100644 --- a/lite/kernels/npu/bridges/mul_op.cc +++ b/lite/kernels/npu/bridges/mul_op.cc @@ -56,45 +56,46 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) { << "[NPU] columns of X must be equal with rows of Y"; int n = y_dims.Slice(y_num_col_dims, y_dims.size()).production(); VLOG(3) << "m:" << m << ",n:" << n << ",k:" << k; - VLOG(3) << "x_name:" << x_name << ", is data: " << graph->HasNode(x_name); - VLOG(3) << "y_name:" << y_name << ", is data: " << graph->HasNode(y_name); - CHECK(graph->HasNode(x_name)) + VLOG(3) << "x_name:" << x_name << ", is data: " << graph->Has(x_name); + VLOG(3) << "y_name:" << y_name << ", is data: " << graph->Has(y_name); + CHECK(graph->Has(x_name)) << "[NPU] MatMul in HiAI DDK only support X is data, Y is const yet."; // X node which supports persistable and non-persistable tensor, and // reshape to (m, k) - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); - auto reshaped_x_node = graph->AddNode(x_name + "/reshape"); - reshaped_x_node->set_input_tensor(*x_node); - reshaped_x_node->set_attr_shape({m, k}); - reshaped_x_node->set_attr_axis(0); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); + auto reshaped_x_node = graph->Add(x_name + "/reshape"); + auto reshaped_x_op = reshaped_x_node->data(); + reshaped_x_op->set_input_tensor(*x_node->data()); + reshaped_x_op->set_attr_shape({m, k}); + reshaped_x_op->set_attr_axis(0); x_node = reshaped_x_node; } else { - auto x_const_node = graph->AddNode(x_name, *x, {m, k}); - x_node = x_const_node; + x_node = graph->Add(x_name, *x, {m, k}); } // Y node which only supports persistable tensor, and reshape to // (k,n) - std::shared_ptr y_node = nullptr; - if (graph->HasNode(y_name)) { - y_node = graph->GetNode(y_name); - auto reshaped_y_node = graph->AddNode(y_name + "/reshape"); - reshaped_y_node->set_input_tensor(*y_node); - reshaped_y_node->set_attr_shape({k, n}); - reshaped_y_node->set_attr_axis(0); + std::shared_ptr y_node = nullptr; + if (graph->Has(y_name)) { + y_node = graph->Get(y_name); + auto reshaped_y_node = graph->Add(y_name + "/reshape"); + auto reshaped_y_op = reshaped_y_node->data(); + reshaped_y_op->set_input_tensor(*y_node->data()); + reshaped_y_op->set_attr_shape({k, n}); + reshaped_y_op->set_attr_axis(0); y_node = reshaped_y_node; } else { - auto y_const_node = graph->AddNode(y_name, *y, {k, n}); - y_node = y_const_node; + y_node = graph->Add(y_name, *y, {k, n}); } // Matmul node - auto mul_node = graph->AddNode(out_name); - mul_node->set_input_x1(*x_node); - mul_node->set_input_x2(*y_node); + auto mul_node = graph->Add(out_name); + auto mul_op = mul_node->data(); + 
mul_op->set_input_x1(*x_node->data()); + mul_op->set_input_x2(*y_node->data()); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -103,4 +104,4 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, mul, paddle::lite::subgraph::npu::MulConverter); +REGISTER_SUBGRAPH_BRIDGE(mul, kNPU, paddle::lite::subgraph::npu::MulConverter); diff --git a/lite/kernels/npu/bridges/pad2d_op.cc b/lite/kernels/npu/bridges/pad2d_op.cc index 451f48b1df..e6852da787 100644 --- a/lite/kernels/npu/bridges/pad2d_op.cc +++ b/lite/kernels/npu/bridges/pad2d_op.cc @@ -45,35 +45,34 @@ int Pad2dConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(padding.size(), 4); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Padding node int xds = x_dims.size(); padding.insert(padding.begin(), xds * 2 - 4, 0); - auto padding_const_node = - graph->AddNode(out_name + "/padding", padding, {xds, 2}); + auto padding_node = graph->Add(out_name + "/padding", padding, {xds, 2}); // Pad node - auto pad2d_node = graph->AddNode(out_name); - pad2d_node->set_input_x(*x_node); - pad2d_node->set_input_padding(*padding_const_node); + auto pad2d_node = graph->Add(out_name); + auto pad2d_op = pad2d_node->data(); + pad2d_op->set_input_x(*x_node->data()); + pad2d_op->set_input_padding(*padding_node->data()); auto mode = op_info->GetAttr("mode"); if (mode == "constant") { // Pad value node auto pad_value = op_info->GetAttr("pad_value"); - auto pad_value_const_node = - graph->AddNode(out_name + "/pad_value", pad_value); - pad2d_node->set_input_constant_values(*pad_value_const_node); - pad2d_node->set_attr_T(0); // type of pad_value: 0:float 3:int32 - pad2d_node->set_attr_mode(0); + auto pad_value_node = graph->Add(out_name + "/pad_value", pad_value); + pad2d_op->set_input_constant_values(*pad_value_node->data()); + pad2d_op->set_attr_T(0); // type of pad_value: 0:float 3:int32 + pad2d_op->set_attr_mode(0); } else if (mode == "reflect") { LOG(WARNING) << "[NPU] pad mode " << mode << " isn't supported in HiAI DDK"; - pad2d_node->set_attr_mode(1); + pad2d_op->set_attr_mode(1); return FAILED; } else { LOG(WARNING) << "[NPU] pad mode " << mode << " isn't supported in HiAI DDK"; @@ -87,6 +86,6 @@ int Pad2dConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - pad2d, +REGISTER_SUBGRAPH_BRIDGE(pad2d, + kNPU, paddle::lite::subgraph::npu::Pad2dConverter); diff --git a/lite/kernels/npu/bridges/paddle_use_bridges.h b/lite/kernels/npu/bridges/paddle_use_bridges.h old mode 100755 new mode 100644 index d6fc535338..a63a0d889d --- a/lite/kernels/npu/bridges/paddle_use_bridges.h +++ b/lite/kernels/npu/bridges/paddle_use_bridges.h @@ -14,40 +14,42 @@ #pragma once -USE_SUBGRAPH_BRIDGE(NPU, sigmoid); -USE_SUBGRAPH_BRIDGE(NPU, relu); -USE_SUBGRAPH_BRIDGE(NPU, tanh); -USE_SUBGRAPH_BRIDGE(NPU, relu_clipped); -USE_SUBGRAPH_BRIDGE(NPU, leaky_relu); -USE_SUBGRAPH_BRIDGE(NPU, softsign); -USE_SUBGRAPH_BRIDGE(NPU, hard_sigmoid); +USE_SUBGRAPH_BRIDGE(sigmoid, kNPU); +USE_SUBGRAPH_BRIDGE(relu, kNPU); +USE_SUBGRAPH_BRIDGE(tanh, kNPU); +USE_SUBGRAPH_BRIDGE(relu_clipped, kNPU); +USE_SUBGRAPH_BRIDGE(leaky_relu, kNPU); +USE_SUBGRAPH_BRIDGE(softsign, kNPU); 
+USE_SUBGRAPH_BRIDGE(hard_sigmoid, kNPU); -USE_SUBGRAPH_BRIDGE(NPU, batch_norm); -USE_SUBGRAPH_BRIDGE(NPU, concat); -USE_SUBGRAPH_BRIDGE(NPU, conv2d); -USE_SUBGRAPH_BRIDGE(NPU, depthwise_conv2d); -USE_SUBGRAPH_BRIDGE(NPU, conv2d_transpose); +USE_SUBGRAPH_BRIDGE(batch_norm, kNPU); +USE_SUBGRAPH_BRIDGE(concat, kNPU); +USE_SUBGRAPH_BRIDGE(conv2d, kNPU); +USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kNPU); +USE_SUBGRAPH_BRIDGE(conv2d_transpose, kNPU); -USE_SUBGRAPH_BRIDGE(NPU, elementwise_add); -USE_SUBGRAPH_BRIDGE(NPU, fusion_elementwise_add_activation); -USE_SUBGRAPH_BRIDGE(NPU, elementwise_sub); -USE_SUBGRAPH_BRIDGE(NPU, elementwise_mul); -USE_SUBGRAPH_BRIDGE(NPU, elementwise_div); +USE_SUBGRAPH_BRIDGE(elementwise_add, kNPU); +USE_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, kNPU); +USE_SUBGRAPH_BRIDGE(elementwise_sub, kNPU); +USE_SUBGRAPH_BRIDGE(elementwise_mul, kNPU); +USE_SUBGRAPH_BRIDGE(elementwise_div, kNPU); -USE_SUBGRAPH_BRIDGE(NPU, fc); -USE_SUBGRAPH_BRIDGE(NPU, bilinear_interp); -USE_SUBGRAPH_BRIDGE(NPU, nearest_interp); -USE_SUBGRAPH_BRIDGE(NPU, mul); -USE_SUBGRAPH_BRIDGE(NPU, pad2d); -USE_SUBGRAPH_BRIDGE(NPU, pool2d); -USE_SUBGRAPH_BRIDGE(NPU, reduce_mean); -USE_SUBGRAPH_BRIDGE(NPU, reshape); -USE_SUBGRAPH_BRIDGE(NPU, reshape2); -USE_SUBGRAPH_BRIDGE(NPU, scale); -USE_SUBGRAPH_BRIDGE(NPU, shuffle_channel); -USE_SUBGRAPH_BRIDGE(NPU, softmax); -USE_SUBGRAPH_BRIDGE(NPU, split); -USE_SUBGRAPH_BRIDGE(NPU, sqrt); -USE_SUBGRAPH_BRIDGE(NPU, square); -USE_SUBGRAPH_BRIDGE(NPU, transpose); -USE_SUBGRAPH_BRIDGE(NPU, transpose2); +USE_SUBGRAPH_BRIDGE(fc, kNPU); +USE_SUBGRAPH_BRIDGE(bilinear_interp, kNPU); +USE_SUBGRAPH_BRIDGE(nearest_interp, kNPU); +USE_SUBGRAPH_BRIDGE(mul, kNPU); +USE_SUBGRAPH_BRIDGE(pad2d, kNPU); +USE_SUBGRAPH_BRIDGE(pool2d, kNPU); +USE_SUBGRAPH_BRIDGE(reduce_mean, kNPU); +USE_SUBGRAPH_BRIDGE(reshape, kNPU); +USE_SUBGRAPH_BRIDGE(reshape2, kNPU); +USE_SUBGRAPH_BRIDGE(scale, kNPU); +USE_SUBGRAPH_BRIDGE(shuffle_channel, kNPU); +USE_SUBGRAPH_BRIDGE(softmax, kNPU); +USE_SUBGRAPH_BRIDGE(split, kNPU); +USE_SUBGRAPH_BRIDGE(sqrt, kNPU); +USE_SUBGRAPH_BRIDGE(square, kNPU); +USE_SUBGRAPH_BRIDGE(transpose, kNPU); +USE_SUBGRAPH_BRIDGE(transpose2, kNPU); +USE_SUBGRAPH_BRIDGE(unsqueeze, kNPU); +USE_SUBGRAPH_BRIDGE(unsqueeze2, kNPU); diff --git a/lite/kernels/npu/bridges/paddle_use_npu_bridges.h b/lite/kernels/npu/bridges/paddle_use_npu_bridges.h deleted file mode 100644 index 9a432d17e5..0000000000 --- a/lite/kernels/npu/bridges/paddle_use_npu_bridges.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "lite/kernels/npu/bridges/registry.h" - -USE_NPU_BRIDGE(sigmoid); -USE_NPU_BRIDGE(relu); -USE_NPU_BRIDGE(tanh); -USE_NPU_BRIDGE(relu_clipped); -USE_NPU_BRIDGE(leaky_relu); -USE_NPU_BRIDGE(softsign); -USE_NPU_BRIDGE(hard_sigmoid); - -USE_NPU_BRIDGE(batch_norm); -USE_NPU_BRIDGE(concat); -USE_NPU_BRIDGE(conv2d); -USE_NPU_BRIDGE(depthwise_conv2d); -USE_NPU_BRIDGE(conv2d_transpose); - -USE_NPU_BRIDGE(elementwise_add); -USE_NPU_BRIDGE(fusion_elementwise_add_activation); -USE_NPU_BRIDGE(elementwise_sub); -USE_NPU_BRIDGE(elementwise_mul); -USE_NPU_BRIDGE(elementwise_div); - -USE_NPU_BRIDGE(fc); -USE_NPU_BRIDGE(bilinear_interp); -USE_NPU_BRIDGE(nearest_interp); -USE_NPU_BRIDGE(mul); -USE_NPU_BRIDGE(pad2d); -USE_NPU_BRIDGE(pool2d); -USE_NPU_BRIDGE(reduce_mean); -USE_NPU_BRIDGE(reshape); -USE_NPU_BRIDGE(reshape2); -USE_NPU_BRIDGE(scale); -USE_NPU_BRIDGE(shuffle_channel); -USE_NPU_BRIDGE(softmax); -USE_NPU_BRIDGE(split); -USE_NPU_BRIDGE(sqrt); -USE_NPU_BRIDGE(square); -USE_NPU_BRIDGE(transpose); -USE_NPU_BRIDGE(transpose2); diff --git a/lite/kernels/npu/bridges/pool_op.cc b/lite/kernels/npu/bridges/pool_op.cc index 8b108fc4ee..42349d1839 100644 --- a/lite/kernels/npu/bridges/pool_op.cc +++ b/lite/kernels/npu/bridges/pool_op.cc @@ -48,11 +48,11 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto paddings = op_info->GetAttr>("paddings"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // pool mode @@ -109,19 +109,19 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // Pooling node - auto pool_node = graph->AddNode(out_name); - pool_node->set_input_x(*x_node); - pool_node->set_attr_mode(mode); - pool_node->set_attr_pad_mode(pad_mode); - pool_node->set_attr_global_pooling(global_pooling); - pool_node->set_attr_window( - ge::AttrValue::LIST_INT(ksize.begin(), ksize.end())); - pool_node->set_attr_pad(ge::AttrValue::LIST_INT{ + auto pool_node = graph->Add(out_name); + auto pool_op = pool_node->data(); + pool_op->set_input_x(*x_node->data()); + pool_op->set_attr_mode(mode); + pool_op->set_attr_pad_mode(pad_mode); + pool_op->set_attr_global_pooling(global_pooling); + pool_op->set_attr_window(ge::AttrValue::LIST_INT(ksize.begin(), ksize.end())); + pool_op->set_attr_pad(ge::AttrValue::LIST_INT{ paddings[0], paddings[1], paddings[2], paddings[3]}); - pool_node->set_attr_stride( + pool_op->set_attr_stride( ge::AttrValue::LIST_INT(strides.begin(), strides.end())); - pool_node->set_attr_ceil_mode(ceil_mode); - // pool_node->set_attr_data_mode(data_mode); + pool_op->set_attr_ceil_mode(ceil_mode); + // pool_op->set_attr_data_mode(data_mode); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -130,6 +130,6 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - pool2d, +REGISTER_SUBGRAPH_BRIDGE(pool2d, + kNPU, paddle::lite::subgraph::npu::PoolConverter); diff --git a/lite/kernels/npu/bridges/pool_op_test.cc b/lite/kernels/npu/bridges/pool_op_test.cc deleted file mode 100644 index 298e065547..0000000000 --- a/lite/kernels/npu/bridges/pool_op_test.cc +++ /dev/null @@ -1,252 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/pool_op.h" -#include -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { -namespace bridges { - -void pool_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto& in_dims = x->dims(); - auto& out_dims = out->dims(); - - const float* src_ptr = x->data(); - float* dst_ptr = out->mutable_data(); - - std::vector ksize = op_info->GetAttr>("ksize"); - std::vector strides = op_info->GetAttr>("strides"); - std::vector paddings = op_info->GetAttr>("paddings"); - bool exclusive = op_info->GetAttr("exclusive"); - std::string pooling_type = op_info->GetAttr("pooling_type"); - bool global_pooling = op_info->GetAttr("global_pooling"); - - int in_n = in_dims[0]; - int in_c = in_dims[1]; - int in_h = in_dims[2]; - int in_w = in_dims[3]; - int size_in_n = in_c * in_h * in_w; - int size_in_c = in_h * in_w; - - int out_h = out_dims[2]; - int out_w = out_dims[3]; - int size_out_n = in_c * out_h * out_w; - int size_out_c = out_h * out_w; - - int window_h = ksize[0]; - int window_w = ksize[1]; - int stride_h = strides[0]; - int stride_w = strides[1]; - int pad_h = paddings[0]; - int pad_w = paddings[2]; - - if (global_pooling == true) { - for (int n = 0; n < in_n; ++n) { - for (int c = 0; c < in_c; ++c) { - const float* src = src_ptr + n * size_in_n + c * size_in_c; - float res = src[0]; - if (pooling_type == "max") { - for (int i = 1; i < size_in_c; ++i) { - float cur_val = src[i]; - res = cur_val > res ? cur_val : res; - } - } else if (pooling_type == "avg") { - for (int i = 1; i < size_in_c; ++i) { - float cur_val = src[i]; - res += cur_val; - } - res /= size_in_c; - } - dst_ptr[n * size_out_n + c] = res; - } - } - } else { - for (int n = 0; n < in_n; ++n) { - for (int c = 0; c < in_c; ++c) { - for (int h = 0; h < out_h; ++h) { - int sh = h * stride_h; - int eh = sh + window_h; - sh = (sh - pad_h) < 0 ? 0 : sh - pad_h; - eh = (eh - pad_h) > in_h ? in_h : eh - pad_h; - for (int w = 0; w < out_w; ++w) { - int sw = w * stride_w; - int ew = sw + window_w; - sw = (sw - pad_w) < 0 ? 0 : sw - pad_w; - ew = (ew - pad_w) > in_w ? in_w : ew - pad_w; - int pooling_size = (ew - sw) * (eh - sh); - if (pooling_size == 0) continue; - float res = 0.f; - for (int kh = sh; kh < eh; ++kh) { - for (int kw = sw; kw < ew; ++kw) { - int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw; - if (kh == sh && kw == sw) { - res = src_ptr[src_idx]; - } else { - if (pooling_type == "max") { - res = res >= src_ptr[src_idx] ? 
res : src_ptr[src_idx]; - } - if (pooling_type == "avg") { - res += src_ptr[src_idx]; - } - } - } - } - if (pooling_type == "avg") { - if (exclusive) { - res /= pooling_size; - } else { - res /= window_h * window_w; - } - } - dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res; - } - } - } - } - } -} - -void test_pool(int bs, - int ic, - int ih, - int iw, - std::string pooling_type, - bool ceil_mode, - bool global_pooling, - bool exclusive, - int ksize, - int stride, - int padding) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("pool2d"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("pooling_type", pooling_type); - opdesc.SetAttr("ksize", std::vector({ksize, ksize})); - opdesc.SetAttr("global_pooling", global_pooling); - opdesc.SetAttr("exclusive", exclusive); - opdesc.SetAttr("strides", std::vector({stride, stride})); - opdesc.SetAttr("paddings", - std::vector({padding, padding, padding, padding})); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - pool_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, pool) { - for (auto pooling_type : {"max", "avg"}) { - for (auto ceil_mode : {true, false}) { - for (auto global_pooling : {/*true, */ false}) { - for (auto exclusive : {true /*, false*/}) { - for (auto ksize : {2, 3}) { - for (auto stride : {1, 2}) { - for (auto padding : {0, 1}) { - for (auto bs : {1, 3}) { - for (auto ic : {1, 3}) { - for (auto ih : {3, 7}) { - for (auto iw : {3, 7}) { - test_pool(bs, - ic, - ih, - iw, - pooling_type, - ceil_mode, - global_pooling, - exclusive, - ksize, - stride, - padding); - } - } - } - } - } - } - } - } - } - } - } - for (auto pooling_type : {"max", "avg"}) { - for (auto ceil_mode : {true, false}) { - bool global_pooling = true; - bool exclusive = true; - int ksize = 2; - int stride = 1; - int padding = 0; - int bs = 6; - int ic = 6; - int ih = 6; - int iw = 6; - test_pool(bs, - ic, - ih, - iw, - pooling_type, - ceil_mode, - global_pooling, - exclusive, - ksize, - stride, - padding); - } - } -} - -} // namespace bridges -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(pool2d); -USE_NPU_BRIDGE(pool2d); diff --git a/lite/kernels/npu/bridges/reduce_mean_op.cc b/lite/kernels/npu/bridges/reduce_mean_op.cc index 6c7f29fb27..29f065675c 100644 --- a/lite/kernels/npu/bridges/reduce_mean_op.cc +++ b/lite/kernels/npu/bridges/reduce_mean_op.cc @@ -52,29 +52,30 @@ int ReduceMeanConverter(void* ctx, OpLite* op, KernelBase* kernel) { std::sort(dim.begin(), dim.end()); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) 
{ + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Using ReduceSum + Scale to implement ReduceMean // Dim node - auto dim_const_node = graph->AddNode(out_name + "/dim", dim); + auto dim_node = graph->Add(out_name + "/dim", dim); // Reduce Sum node - auto reduce_sum_node = - graph->AddNode(out_name + "/reducesum"); - reduce_sum_node->set_input_x(*x_node); - reduce_sum_node->set_input_w(*dim_const_node); - reduce_sum_node->set_attr_keep_dims(keep_dim); + auto reduce_sum_node = graph->Add(out_name + "/reducesum"); + auto reduce_sum_op = reduce_sum_node->data(); + reduce_sum_op->set_input_x(*x_node->data()); + reduce_sum_op->set_input_w(*dim_node->data()); + reduce_sum_op->set_attr_keep_dims(keep_dim); // Scale node - auto scale_node = graph->AddNode(out_name); - scale_node->set_input_x(*reduce_sum_node); - scale_node->set_attr_axis(1); + auto scale_node = graph->Add(out_name); + auto scale_op = scale_node->data(); + scale_op->set_input_x(*reduce_sum_node->data()); + scale_op->set_attr_axis(1); // Add filter node(fill with scale) float scale = 1; @@ -95,9 +96,8 @@ int ReduceMeanConverter(void* ctx, OpLite* op, KernelBase* kernel) { remove(scale_bias_shape.begin(), scale_bias_shape.end(), kDelFlag), scale_bias_shape.end()); } - auto filter_const_node = - graph->AddNode(out_name + "/filter", scale, scale_bias_shape); - scale_node->set_input_filter(*filter_const_node); + auto filter_node = graph->Add(out_name + "/filter", scale, scale_bias_shape); + scale_op->set_input_filter(*filter_node->data()); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -106,6 +106,6 @@ int ReduceMeanConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - reduce_mean, +REGISTER_SUBGRAPH_BRIDGE(reduce_mean, + kNPU, paddle::lite::subgraph::npu::ReduceMeanConverter); diff --git a/lite/kernels/npu/bridges/registry.cc b/lite/kernels/npu/bridges/registry.cc index 5f89bcb313..39181ccee9 100644 --- a/lite/kernels/npu/bridges/registry.cc +++ b/lite/kernels/npu/bridges/registry.cc @@ -24,27 +24,27 @@ Registry& Registry::Instance() { return x; } -void Registry::Insert(const std::string& dev_type, - const std::string& op_type, +void Registry::Insert(const std::string& op_type, + const std::string& target, const cvt_func_type& cvt_func_name) { - auto it = map_.find(dev_type); + auto it = map_.find(target); if (it == map_.end()) { map_.insert(std::make_pair( - dev_type, std::unordered_map())); + target, std::unordered_map())); } - map_.at(dev_type).insert(std::make_pair(op_type, cvt_func_name)); + map_.at(target).insert(std::make_pair(op_type, cvt_func_name)); } -const cvt_func_type& Registry::Select(const std::string& dev_type, - const std::string& op_type) const { - return map_.at(dev_type).at(op_type); +const cvt_func_type& Registry::Select(const std::string& op_type, + const std::string& target) const { + return map_.at(target).at(op_type); } -bool Registry::Exists(const std::string& dev_type, - const std::string& op_type) const { - bool found = map_.find(dev_type) != map_.end(); +bool Registry::Exists(const std::string& op_type, + const std::string& target) const { + bool found = map_.find(target) != map_.end(); if (found) { - found = map_.at(dev_type).find(op_type) != map_.at(dev_type).end(); + found = map_.at(target).find(op_type) != map_.at(target).end(); } return found; } diff --git a/lite/kernels/npu/bridges/registry.h b/lite/kernels/npu/bridges/registry.h index 5198a3f8f2..615a106864 
100644 --- a/lite/kernels/npu/bridges/registry.h +++ b/lite/kernels/npu/bridges/registry.h @@ -42,12 +42,12 @@ class Registry { public: static Registry& Instance(); - void Insert(const std::string& dev_type, - const std::string& op_type, + void Insert(const std::string& op_type, + const std::string& target, const cvt_func_type& cvt_func_name); - const cvt_func_type& Select(const std::string& dev_type, - const std::string& op_type) const; - bool Exists(const std::string& dev_type, const std::string& op_type) const; + const cvt_func_type& Select(const std::string& op_type, + const std::string& target) const; + bool Exists(const std::string& op_type, const std::string& target) const; Registry() = default; private: @@ -67,24 +67,24 @@ class Registry { #define UNUSED __attribute__((unused)) #endif -#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \ +#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE(uniq_name, msg) \ struct __test_global_namespace_##uniq_name##__ {}; \ static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ __test_global_namespace_##uniq_name##__>::value, \ msg) -#define REGISTER_SUBGRAPH_BRIDGE(dev_type, op_type, cvt_func_name) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_subgraph_bridge_##dev_type##_##op_type##__, \ +#define REGISTER_SUBGRAPH_BRIDGE(op_type__, target__, cvt_func_name) \ + STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE_LITE( \ + __reg_subgraph_bridge_##op_type__##_##target__##__, \ "REGISTER_SUBGRAPH_BRIDGE must be called in global namespace only " \ "once!"); \ - int __reg_subgraph_bridge_##dev_type##_##op_type##_Insert() { \ + int __reg_subgraph_bridge_##op_type__##_##target__##_Insert() { \ paddle::lite::subgraph::Registry::Instance().Insert( \ - #dev_type, #op_type, cvt_func_name); \ + #op_type__, #target__, cvt_func_name); \ return 0; \ } -#define USE_SUBGRAPH_BRIDGE(dev_type, op_type) \ - extern int __reg_subgraph_bridge_##dev_type##_##op_type##_Insert(); \ - static int __reg_subgraph_bridge_##dev_type##_##op_type##_Insert_return \ - UNUSED = __reg_subgraph_bridge_##dev_type##_##op_type##_Insert(); +#define USE_SUBGRAPH_BRIDGE(op_type__, target__) \ + extern int __reg_subgraph_bridge_##op_type__##_##target__##_Insert(); \ + static int __reg_subgraph_bridge_##op_type__##_##target__##_Insert_return \ + UNUSED = __reg_subgraph_bridge_##op_type__##_##target__##_Insert(); diff --git a/lite/kernels/npu/bridges/reshape_op.cc b/lite/kernels/npu/bridges/reshape_op.cc index d5100dee4a..50c7f9d65a 100644 --- a/lite/kernels/npu/bridges/reshape_op.cc +++ b/lite/kernels/npu/bridges/reshape_op.cc @@ -34,26 +34,25 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Get input and output vars and op attributes auto x_name = op_info->Input("X").front(); auto x_type = kernel->GetInputDeclType("X"); - CHECK(x_type->precision() == PRECISION(kFloat)); - CHECK(x_type->layout() == DATALAYOUT(kNCHW)); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); auto out_type = kernel->GetOutputDeclType("Out"); - CHECK(out_type->precision() == PRECISION(kFloat)); - CHECK(out_type->layout() == DATALAYOUT(kNCHW)); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Reshape node - auto reshape_node = graph->AddNode(out_name); 
- reshape_node->set_input_tensor(*x_node); + auto reshape_node = graph->Add( + out_name, x_node->precision(), x_node->layout()); + auto reshape_op = reshape_node->data(); + reshape_op->set_input_tensor(*x_node->data()); // Read shape from "ShapeTensor"(input), or "Shape"(input), or "shape"(attr) if (HasInputArg(op_info, scope, "ShapeTensor")) { @@ -64,9 +63,9 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { // auto actual_shape_type = kernel->GetInputDeclType("Shape"); // CHECK(actual_shape_type->precision() == PRECISION(kInt32)); // CHECK(actual_shape_type->layout() == DATALAYOUT(kNCHW)); - std::shared_ptr actual_shape_node = nullptr; - if (graph->HasNode(actual_shape_name)) { - actual_shape_node = graph->GetNode(actual_shape_name); + std::shared_ptr actual_shape_node = nullptr; + if (graph->Has(actual_shape_name)) { + actual_shape_node = graph->Get(actual_shape_name); } else { auto actual_shape = scope->FindMutableTensor(actual_shape_name); auto actual_shape_dims = actual_shape->dims(); @@ -80,13 +79,13 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { LOG(WARNING) << "[NPU] HiAI DDK only supports less than 4 dimensions, " "but Shape has " << out_shape.size(); + return FAILED; } - auto actual_shape_const_node = - graph->AddNode(actual_shape_name, - std::vector(out_shape.begin(), out_shape.end())); - actual_shape_node = actual_shape_const_node; + actual_shape_node = + graph->Add(actual_shape_name, + std::vector(out_shape.begin(), out_shape.end())); } - reshape_node->set_input_w(*actual_shape_node); + reshape_op->set_input_w(*actual_shape_node->data()); } else { auto shape = op_info->GetAttr>("shape"); auto out_dims = lite::operators::ValidateShape(shape, x_dims); @@ -95,33 +94,12 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { LOG(WARNING) << "[NPU] HiAI DDK only supports less than 4 dimensions, " "but shape has " << out_shape.size(); + return FAILED; } - reshape_node->set_attr_shape( + reshape_op->set_attr_shape( ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end())); } - // XShape node - if (op_type == "reshape2") { - // Append an extra reshape node to calc XShape - std::vector xshape_dims(x_dims.size() + 1, 1); - for (size_t i = 0; i < x_dims.size(); i++) { - xshape_dims[i + 1] = x_dims[i]; - } - if (xshape_dims.size() > 4) { - LOG(WARNING) << "[NPU] HiAI DDK only supports less than 4 dimensions, " - "but XShape has " - << xshape_dims.size(); - return FAILED; - } - auto xshape_name = op_info->Output("XShape").front(); - // auto xshape_type = kernel->GetOutputDeclType("XShape"); - // CHECK(xshape_type->precision() == PRECISION(kFloat)); - // CHECK(xshape_type->layout() == DATALAYOUT(kNCHW)); - auto xshape_node = graph->AddNode(xshape_name); - xshape_node->set_input_tensor(*x_node); - xshape_node->set_attr_shape( - ge::AttrValue::LIST_INT(xshape_dims.begin(), xshape_dims.end())); - } return REBUILD_WHEN_SHAPE_CHANGED; } @@ -130,9 +108,9 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - reshape, +REGISTER_SUBGRAPH_BRIDGE(reshape, + kNPU, paddle::lite::subgraph::npu::ReshapeConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - reshape2, +REGISTER_SUBGRAPH_BRIDGE(reshape2, + kNPU, paddle::lite::subgraph::npu::ReshapeConverter); diff --git a/lite/kernels/npu/bridges/scale_op.cc b/lite/kernels/npu/bridges/scale_op.cc index ca04996faf..d0139a9e2f 100644 --- a/lite/kernels/npu/bridges/scale_op.cc +++ b/lite/kernels/npu/bridges/scale_op.cc @@ 
-37,12 +37,15 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(x_type->layout() == DATALAYOUT(kNCHW)); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); - CHECK_GE(x_dims.size(), 2); + auto x_rank = x_dims.size(); + CHECK_GE(x_rank, 2); auto out_name = op_info->Output("Out").front(); auto out_type = kernel->GetOutputDeclType("Out"); CHECK(out_type->precision() == PRECISION(kFloat)); CHECK(out_type->layout() == DATALAYOUT(kNCHW)); - std::vector scale_bias_shape = {x_dims[1]}; + // HiAI only support [n, c, 1, 1] for the shape of scale and bias + std::vector scale_bias_shape = { + 1, x_rank < 3 ? 1 : x_dims[x_rank - 3], 1, 1}; float scale = op_info->GetAttr("scale"); float bias = op_info->GetAttr("bias"); bool bias_after_scale = op_info->GetAttr("bias_after_scale"); @@ -51,29 +54,28 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x, CvtShape(x_dims)); } // Scale node - auto scale_node = graph->AddNode(out_name); - scale_node->set_input_x(*x_node); - scale_node->set_attr_axis(1); + auto scale_node = graph->Add(out_name); + auto scale_op = scale_node->data(); + scale_op->set_input_x(*x_node->data()); + scale_op->set_attr_axis(1); // Add filter node(fill with scale) - auto filter_const_node = - graph->AddNode(out_name + "/filter", scale, scale_bias_shape); - scale_node->set_input_filter(*filter_const_node); + auto filter_node = graph->Add(out_name + "/filter", scale, scale_bias_shape); + scale_op->set_input_filter(*filter_node->data()); // Add bias node(fill with bias) if (fabs(bias) > 1e-6f) { - auto bias_const_node = - graph->AddNode(out_name + "/bias", bias, scale_bias_shape); - scale_node->set_input_bias(*bias_const_node); - scale_node->set_attr_has_bias_value(true); + auto bias_node = graph->Add(out_name + "/bias", bias, scale_bias_shape); + scale_op->set_input_bias(*bias_node->data()); + scale_op->set_attr_has_bias_value(true); } return REBUILD_WHEN_SHAPE_CHANGED; } @@ -83,6 +85,6 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - scale, +REGISTER_SUBGRAPH_BRIDGE(scale, + kNPU, paddle::lite::subgraph::npu::ScaleConverter); diff --git a/lite/kernels/npu/bridges/shuffle_channel_op.cc b/lite/kernels/npu/bridges/shuffle_channel_op.cc index 47469e1506..0552bd2382 100644 --- a/lite/kernels/npu/bridges/shuffle_channel_op.cc +++ b/lite/kernels/npu/bridges/shuffle_channel_op.cc @@ -44,17 +44,19 @@ int ShuffleChannelConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto group = op_info->GetAttr("group"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Shuffle Channel node - auto shuffle_channel_node = graph->AddNode(out_name); - shuffle_channel_node->set_input_x(*x_node); - shuffle_channel_node->set_attr_group(group); + auto shuffle_channel_node = graph->Add(out_name); + auto shuffle_channel_op = + shuffle_channel_node->data(); + shuffle_channel_op->set_input_x(*x_node->data()); + 
shuffle_channel_op->set_attr_group(group); return SUCCESS; } @@ -63,6 +65,6 @@ int ShuffleChannelConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - shuffle_channel, +REGISTER_SUBGRAPH_BRIDGE(shuffle_channel, + kNPU, paddle::lite::subgraph::npu::ShuffleChannelConverter); diff --git a/lite/kernels/npu/bridges/shuffle_channel_op_test.cc b/lite/kernels/npu/bridges/shuffle_channel_op_test.cc deleted file mode 100644 index cbf2eac9f3..0000000000 --- a/lite/kernels/npu/bridges/shuffle_channel_op_test.cc +++ /dev/null @@ -1,117 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/shuffle_channel_op.h" -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { -namespace bridges { - -void shuffle_channel_ref( - const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_data = x->mutable_data(); - auto out_data = out->mutable_data(); - int group = op_info->GetAttr("group"); - auto x_dims = x->dims(); - - int n_size = x_dims.production() / x_dims[0]; - int c_size = n_size / x_dims[1]; - for (int n = 0; n < x_dims[0]; n++) { - int g_num = x_dims[1] / group; - auto tmp_out_data = out_data; - for (int g = 0; g < g_num; g++) { - auto tmp_x_data = x_data + g * c_size; - for (int i = 0; i < group; i++) { - std::memcpy(tmp_out_data, - tmp_x_data + i * g_num * c_size, - c_size * sizeof(float)); - tmp_out_data += c_size; - } - } - x_data += n_size; - out_data += n_size; - } -} - -void test_shuffle_channel(int bs, int ic, int ih, int iw, int group) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("shuffle_channel"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("group", group); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - shuffle_channel_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - 
EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, softmax) { - for (auto bs : {1, 4}) { - for (auto ic : {1, 24, 35}) { - for (auto ih : {1, 4}) { - for (auto iw : {1, 4}) { - for (auto group : {1, 3, 7, 24, 35}) { - if (ic % group != 0) continue; - test_shuffle_channel(bs, ic, ih, iw, group); - } - } - } - } - } -} - -} // namespace bridges -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(shuffle_channel); -USE_NPU_BRIDGE(shuffle_channel); diff --git a/lite/kernels/npu/bridges/softmax_op.cc b/lite/kernels/npu/bridges/softmax_op.cc index 01d8b0a944..24bbb790e0 100644 --- a/lite/kernels/npu/bridges/softmax_op.cc +++ b/lite/kernels/npu/bridges/softmax_op.cc @@ -37,29 +37,34 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(x_type->layout() == DATALAYOUT(kNCHW)); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); + auto x_rank = x_dims.size(); auto out_name = op_info->Output("Out").front(); auto out_type = kernel->GetOutputDeclType("Out"); CHECK(out_type->precision() == PRECISION(kFloat)); CHECK(out_type->layout() == DATALAYOUT(kNCHW)); auto axis = op_info->GetAttr("axis"); - if (x_dims.size() > 3) { - CHECK(!(axis == 2 && x_dims[3] > 1)) - << "[NPU] Unsupported softmax params: axis = " << axis - << " :x_w = " << x_dims[3]; + if (axis < 0) { + axis += x_rank; + } + if (axis == 2 && x_rank > 3 && x_dims[3] != 1) { + LOG(WARNING) << "[NPU] Unsupported softmax params: axis = " << axis + << " :x_w = " << x_dims[3]; + return FAILED; } // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Softmax node - auto softmax_node = graph->AddNode(out_name); - softmax_node->set_input_x(*x_node); - softmax_node->set_attr_axis(axis); + auto softmax_node = graph->Add(out_name); + auto softmax_op = softmax_node->data(); + softmax_op->set_input_x(*x_node->data()); + softmax_op->set_attr_axis(axis); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -68,6 +73,6 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - softmax, +REGISTER_SUBGRAPH_BRIDGE(softmax, + kNPU, paddle::lite::subgraph::npu::SoftmaxConverter); diff --git a/lite/kernels/npu/bridges/split_op.cc b/lite/kernels/npu/bridges/split_op.cc index 597de04d5b..2cdf49fd54 100644 --- a/lite/kernels/npu/bridges/split_op.cc +++ b/lite/kernels/npu/bridges/split_op.cc @@ -47,33 +47,34 @@ int SplitConverter(void* ctx, OpLite* op, KernelBase* kernel) { int64_t sections_num = static_cast(sections.size()); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Split node - auto split_node = graph->AddNode(op_type + "/" + x_name); - split_node->set_input_x(*x_node); - split_node->set_attr_axis(static_cast(axis)); + auto split_node = graph->Add(op_type + "/" + x_name); + auto split_op = split_node->data(); + split_op->set_input_x(*x_node->data()); + split_op->set_attr_axis(static_cast(axis)); if (num > 0) { - split_node->set_attr_output_num(static_cast(num)); + 
split_op->set_attr_output_num(static_cast(num)); } else { - split_node->set_attr_output_num(sections_num); + split_op->set_attr_output_num(sections_num); auto size_split = ge::AttrValue::LIST_INT(sections.begin(), sections.end()); - split_node->set_attr_size_split(size_split); + split_op->set_attr_size_split(size_split); } - split_node->create_dynamic_output_y(out_names.size()); + split_op->create_dynamic_output_y(out_names.size()); int idx = 1; for (auto& out_name : out_names) { - auto zero_const_node = - graph->AddNode(out_name + "/zero" + std::to_string(idx), 0); - auto add_node = graph->AddNode(out_name); - add_node->set_input_x1(*split_node, "y" + std::to_string(idx)); - add_node->set_input_x2(*zero_const_node); + auto zero_node = graph->Add(out_name + "/zero" + std::to_string(idx), 0); + auto add_node = graph->Add(out_name); + auto add_op = add_node->data(); + add_op->set_input_x1(*split_node->data(), "y" + std::to_string(idx)); + add_op->set_input_x2(*zero_node->data()); idx++; } return REBUILD_WHEN_SHAPE_CHANGED; @@ -84,6 +85,6 @@ int SplitConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - split, +REGISTER_SUBGRAPH_BRIDGE(split, + kNPU, paddle::lite::subgraph::npu::SplitConverter); diff --git a/lite/kernels/npu/bridges/sqrt_op.cc b/lite/kernels/npu/bridges/sqrt_op.cc index 2ee58862fb..e8fde2272a 100644 --- a/lite/kernels/npu/bridges/sqrt_op.cc +++ b/lite/kernels/npu/bridges/sqrt_op.cc @@ -43,16 +43,17 @@ int SqrtConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(out_type->layout() == DATALAYOUT(kNCHW)); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Sqrt node - auto sqrt_node = graph->AddNode(out_name); - sqrt_node->set_input_x(*x_node); + auto sqrt_node = graph->Add(out_name); + auto sqrt_op = sqrt_node->data(); + sqrt_op->set_input_x(*x_node->data()); return SUCCESS; } @@ -61,4 +62,6 @@ int SqrtConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, sqrt, paddle::lite::subgraph::npu::SqrtConverter); +REGISTER_SUBGRAPH_BRIDGE(sqrt, + kNPU, + paddle::lite::subgraph::npu::SqrtConverter); diff --git a/lite/kernels/npu/bridges/square_op.cc b/lite/kernels/npu/bridges/square_op.cc index 3f6676c8a8..f03c7690cb 100644 --- a/lite/kernels/npu/bridges/square_op.cc +++ b/lite/kernels/npu/bridges/square_op.cc @@ -43,16 +43,17 @@ int SquareConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(out_type->layout() == DATALAYOUT(kNCHW)); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Square node - auto square_node = graph->AddNode(out_name); - square_node->set_input_x(*x_node); + auto square_node = graph->Add(out_name); + auto square_op = square_node->data(); + square_op->set_input_x(*x_node->data()); return SUCCESS; } @@ -61,6 +62,6 @@ int SquareConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - square, +REGISTER_SUBGRAPH_BRIDGE(square, + kNPU, 
paddle::lite::subgraph::npu::SquareConverter); diff --git a/lite/kernels/npu/bridges/transpose_op.cc b/lite/kernels/npu/bridges/transpose_op.cc index 70449dac7a..bdac84df3c 100644 --- a/lite/kernels/npu/bridges/transpose_op.cc +++ b/lite/kernels/npu/bridges/transpose_op.cc @@ -37,23 +37,24 @@ int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(x_type->layout() == DATALAYOUT(kNCHW)); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); - auto out_name = op_info->Input("Out").front(); + auto out_name = op_info->Output("Out").front(); auto axis = op_info->GetAttr>("axis"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Transpose node - auto transpose_node = graph->AddNode(out_name); - transpose_node->set_input_x(*x_node); - auto w_const_node = graph->AddNode(out_name + "/w", 1.0f); - transpose_node->set_input_w(*w_const_node); - transpose_node->set_attr_order( + auto transpose_node = graph->Add(out_name); + auto transpose_op = transpose_node->data(); + transpose_op->set_input_x(*x_node->data()); + auto w_node = graph->Add(out_name + "/w", 1.0f); + transpose_op->set_input_w(*w_node->data()); + transpose_op->set_attr_order( ge::AttrValue::LIST_INT(axis.begin(), axis.end())); return SUCCESS; } @@ -63,9 +64,9 @@ int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - transpose, +REGISTER_SUBGRAPH_BRIDGE(transpose, + kNPU, paddle::lite::subgraph::npu::TransposeConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - transpose2, +REGISTER_SUBGRAPH_BRIDGE(transpose2, + kNPU, paddle::lite::subgraph::npu::TransposeConverter); diff --git a/lite/kernels/npu/bridges/transpose_op_test.cc b/lite/kernels/npu/bridges/transpose_op_test.cc deleted file mode 100644 index 9ad2610caa..0000000000 --- a/lite/kernels/npu/bridges/transpose_op_test.cc +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/operators/transpose_op.h" -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { -namespace bridges { - -int data_index(std::vector pos, DDimLite dims) { - int d1 = dims[1]; - int d2 = dims[2]; - int d3 = dims[3]; - return pos[3] + pos[2] * d3 + pos[1] * d3 * d2 + pos[0] * d3 * d2 * d1; -} - -std::vector pos_trans(std::vector in_pos, std::vector axis) { - std::vector out_pos(in_pos.size()); - for (int i = 0; i < axis.size(); i++) { - out_pos[axis[i]] = in_pos[i]; - } - return out_pos; -} - -void transpose_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto input = - scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto output = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_dims = input->dims(); - auto y_dims = output->dims(); - auto axis = op_info->GetAttr>("axis"); - - auto* input_data = input->data(); - auto* output_data = output->mutable_data(); - - int input_n = x_dims[0]; - int input_c = x_dims[1]; - int input_h = x_dims[2]; - int input_w = x_dims[3]; - int output_n = y_dims[0]; - int output_c = y_dims[1]; - int output_h = y_dims[2]; - int output_w = y_dims[3]; - - for (int n = 0; n < input_n; ++n) { - for (int c = 0; c < input_c; ++c) { - for (int h = 0; h < input_h; ++h) { - for (int w = 0; w < input_w; ++w) { - std::vector in_pos{n, c, h, w}; - std::vector out_pos = pos_trans(in_pos, axis); - int in_index = data_index(in_pos, x_dims); - int out_index = data_index(out_pos, y_dims); - output_data[out_index] = input_data[in_index]; - } - } - } - } -} - -void test_transpose(int bs, int ic, int ih, int iw, std::vector axis) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("transpose"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("axis", axis); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - transpose_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, transpose) { -#if 0 - for (auto bs : {1, 4, 7}) { - for (auto ic : {1, 4, 7}) { - for (auto ih : {1, 4, 7}) { - for (auto iw : {1, 4, 7}) { - for (auto axis : {std::vector{0, 1, 2, 3}, - std::vector{0, 1, 3, 2}, - std::vector{0, 3, 1, 2}, - std::vector{1, 2, 3, 0}, - std::vector{3, 2, 1, 0}, - std::vector{2, 3, 1, 0}}) { - test_transpose(bs, ic, ih, iw, axis); - } - } - } - } - } -#endif - test_transpose(2, 3, 4, 5, std::vector{0, 1, 3, 2}); - // test_transpose(2, 3, 4, 5, std::vector{0, 1, 2, 3}); - // test_transpose(2, 2, 2, 2, std::vector{0,1,3,2}); - // test_transpose(1, 1, 2, 2, 
std::vector{0,1,3,2}); - // test_transpose(1, 1, 1, 2, std::vector{0,1,2,3}); -} - -} // namespace bridges -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(transpose); -USE_NPU_BRIDGE(transpose); - -USE_LITE_OP(transpose2); -USE_NPU_BRIDGE(transpose2); diff --git a/lite/kernels/npu/bridges/unsqueeze_op.cc b/lite/kernels/npu/bridges/unsqueeze_op.cc old mode 100755 new mode 100644 index 8ff95d4ed8..bcb3bee83b --- a/lite/kernels/npu/bridges/unsqueeze_op.cc +++ b/lite/kernels/npu/bridges/unsqueeze_op.cc @@ -32,30 +32,30 @@ int UnsqueezeConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto x_name = op_info->Input("X").front(); auto x_type = kernel->GetInputDeclType("X"); - CHECK(x_type->precision() == PRECISION(kFloat)); CHECK(x_type->layout() == DATALAYOUT(kNCHW)); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); auto out_type = kernel->GetOutputDeclType("Out"); - CHECK(out_type->precision() == PRECISION(kFloat)); CHECK(out_type->layout() == DATALAYOUT(kNCHW)); auto out_shape = scope->FindTensor(out_name)->dims().Vectorize(); CHECK(op_info->HasAttr("axes")) << "[NPU] unsqueeze not support axes from tensor now"; // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Unsqueeze node - auto unsqueeze_node = graph->AddNode(out_name); - unsqueeze_node->set_input_tensor(*x_node); - unsqueeze_node->set_attr_shape( + auto unsqueeze_node = graph->Add(out_name); + auto unsqueeze_op = unsqueeze_node->data(); + unsqueeze_op->set_input_tensor(*x_node->data()); + unsqueeze_op->set_attr_shape( ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end())); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -65,9 +65,9 @@ int UnsqueezeConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(NPU, - unsqueeze, +REGISTER_SUBGRAPH_BRIDGE(unsqueeze, + kNPU, paddle::lite::subgraph::npu::UnsqueezeConverter); -REGISTER_SUBGRAPH_BRIDGE(NPU, - unsqueeze2, +REGISTER_SUBGRAPH_BRIDGE(unsqueeze2, + kNPU, paddle::lite::subgraph::npu::UnsqueezeConverter); diff --git a/lite/kernels/npu/bridges/unsqueeze_op_test.cc b/lite/kernels/npu/bridges/unsqueeze_op_test.cc deleted file mode 100755 index c59843f614..0000000000 --- a/lite/kernels/npu/bridges/unsqueeze_op_test.cc +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/operators/unsqueeze_op.h" -#include -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/npu/bridges/registry.h" -#include "lite/kernels/npu/bridges/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { -namespace bridges { - -static DDim GetOutputShape(const std::vector& unsqz_dims, - const DDim& in_dims) { - int output_size = in_dims.size() + static_cast(unsqz_dims.size()); - int cur_output_size = in_dims.size(); - std::vector output_shape(output_size, 0); - - // Validate Check: rank range. - CHECK_LE(output_size, 6) << "The output tensor's rank should be less than 6."; - - for (int axis : unsqz_dims) { - int cur = axis < 0 ? axis + cur_output_size + 1 : axis; - // Validate Check: the axis bound - CHECK((cur >= 0) && (cur <= cur_output_size)) - << "The unsqueeze dims must be within range of current rank."; - // Move old axis, and insert new axis - for (int i = cur_output_size; i >= cur; --i) { - if (output_shape[i] == 1) { - // Move axis - output_shape[i + 1] = 1; - output_shape[i] = 0; - } - } - - output_shape[cur] = 1; - // Add the output size. - cur_output_size++; - } - - // Make output shape - for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) { - if (output_shape[out_idx] == 0) { - output_shape[out_idx] = in_dims[in_idx++]; - } - } - - return DDim(output_shape); -} - -template -void unsqueeze_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - - auto x = scope->FindTensor("x"); - auto out = scope->FindMutableTensor("out_ref"); - auto axes = op_info->GetAttr>("axes"); - auto y_dims = GetOutputShape(axes, x->dims()); - out->Resize(y_dims); - - auto x_data = x->data(); - auto out_data = out->mutable_data(); - - memcpy(out_data, x_data, x->numel() * sizeof(float)); -} - -void test_unsqueeze(const std::vector& input_shape, - std::vector axes) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - auto* x = scope.NewTensor(x_var_name); - auto* out = scope.NewTensor(out_var_name); - auto* out_ref = scope.NewTensor(out_ref_var_name); - x->Resize(input_shape); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("unsqueeze"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - opdesc.SetAttr("axes", axes); - - // create and convert op to NPU model, then run it on NPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - - // execute reference implementation and save to output tensor - unsqueeze_ref(op); - - // compare results - CHECK_EQ(out->dims().size(), out_ref->dims().size()); - for (int i = 0; i < out->dims().size(); i++) { - CHECK_EQ(out->dims()[i], out_ref->dims()[i]); - } - - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2); - } -} - -TEST(NPUBridges, unsqueeze) { - test_unsqueeze({2}, {0, 2}); - test_unsqueeze({2, 3}, {1, 3}); - test_unsqueeze({1, 2, 3}, {3}); - test_unsqueeze({5, 6, 7}, {1}); -} - -} // namespace bridges -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(unsqueeze); -USE_NPU_BRIDGE(unsqueeze); diff --git a/lite/kernels/npu/bridges/utility.cc b/lite/kernels/npu/bridges/utility.cc old mode 100755 new 
mode 100644 index f79936c5d7..d9c9ffae92 --- a/lite/kernels/npu/bridges/utility.cc +++ b/lite/kernels/npu/bridges/utility.cc @@ -85,10 +85,26 @@ ge::Format CvtDataLayoutType(DataLayoutType itype) { return otype; } +std::vector CvtShape(const std::vector& in_shape) { + std::vector out_shape; + // Padding the shape to 4-dimensions(NCHW) + for (int i = 0; i < 4 - in_shape.size(); i++) { + out_shape.push_back(1); + } + for (int i = 0; i < in_shape.size(); i++) { + out_shape.push_back(in_shape[i]); + } + return out_shape; +} + +std::vector CvtShape(const DDim& in_dims) { + return CvtShape(in_dims.Vectorize()); +} + ge::TensorPtr CvtTensor(const Tensor& in_tensor, std::vector out_shape, - PrecisionType in_precision, DataLayoutType in_layout) { + PrecisionType in_precision = in_tensor.precision(); auto in_size = in_tensor.dims().production(); auto in_shape = in_tensor.dims().Vectorize(); if (out_shape.empty()) { diff --git a/lite/kernels/npu/bridges/utility.h b/lite/kernels/npu/bridges/utility.h old mode 100755 new mode 100644 index e8300a0472..c4721d55a0 --- a/lite/kernels/npu/bridges/utility.h +++ b/lite/kernels/npu/bridges/utility.h @@ -19,12 +19,12 @@ #include #include #include -#include "ai_ddk_lib/include/graph/buffer.h" -#include "ai_ddk_lib/include/graph/graph.h" -#include "ai_ddk_lib/include/graph/model.h" -#include "ai_ddk_lib/include/graph/op/all_ops.h" -#include "ai_ddk_lib/include/graph/operator.h" -#include "ai_ddk_lib/include/graph/operator_reg.h" +#include "graph/buffer.h" +#include "graph/graph.h" +#include "graph/model.h" +#include "graph/op/all_ops.h" +#include "graph/operator.h" +#include "graph/operator_reg.h" #include "lite/core/op_lite.h" #include "lite/utils/macros.h" @@ -70,59 +70,15 @@ ge::DataType CvtPrecisionType(PrecisionType itype); ge::Format CvtDataLayoutType(DataLayoutType itype); +// Padding the shape to 4-dimensions(NCHW) for HiAI +std::vector CvtShape(const std::vector& in_shape); + +std::vector CvtShape(const DDim& in_dims); + ge::TensorPtr CvtTensor(const Tensor& in_tensor, std::vector out_shape = {}, - PrecisionType in_precision = PRECISION(kFloat), DataLayoutType in_layout = DATALAYOUT(kNCHW)); -template -ge::TensorPtr CreateTensorAndFillData(const std::vector& data, - std::vector shape = {}, - ge::Format format = ge::FORMAT_NCHW) { - const std::type_info& info = typeid(T); - ge::DataType type = ge::DT_FLOAT; - if (info == typeid(float)) { - type = ge::DT_FLOAT; - } else if (info == typeid(int8_t)) { - type = ge::DT_INT8; - } else if (info == typeid(int16_t)) { - type = ge::DT_INT16; - } else if (info == typeid(int32_t)) { - type = ge::DT_INT32; - } else if (info == typeid(int64_t)) { - type = ge::DT_INT64; - } else { - LOG(FATAL) << "[NPU] Unknow value type " << info.name(); - } - if (shape.empty()) { - shape = {static_cast(data.size())}; - } else { - int size = 1; - for (auto i : shape) { - size *= i; - } - CHECK_EQ(data.size(), size); - } - ge::TensorDesc desc(ge::Shape(shape), format, type); - ge::TensorPtr tensor = std::make_shared(); - tensor->SetTensorDesc(desc); - tensor->SetData(reinterpret_cast(data.data()), - data.size() * sizeof(T)); - return tensor; -} - -template -ge::TensorPtr CreateTensorAndFillData(T value, - std::vector shape = {1}, - ge::Format format = ge::FORMAT_NCHW) { - int64_t size = 1; - for (auto i : shape) { - size *= i; - } - std::vector data(size, value); - return CreateTensorAndFillData(data, shape, format); -} - int CvtActMode(std::string act_type); } // namespace npu diff --git a/lite/kernels/npu/graph_compute.cc 
b/lite/kernels/npu/graph_compute.cc deleted file mode 100644 index 9a05a33062..0000000000 --- a/lite/kernels/npu/graph_compute.cc +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/npu/graph_compute.h" -#include -#include - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { - -void GraphCompute::PrepareForRun() { - auto& ctx = this->ctx_->template As(); - auto& param = this->Param(); - - // Load HiAI model from the weight tensor and release its buffer - // to save memory - CHECK(param.weight); - CHECK(lite::npu::LoadModel(*param.weight, &model_client_, &model_name_)); - // TODO(hong19860320): find an good way to free the model data. - // No interface exists to free the data of tensor, so I resize the dim to 1 - // and change target to force it to realloc a small size memory. - param.weight->Resize({1}); - param.weight->mutable_data(TargetType::kARM); - CHECK(model_client_); - - // Query the dimensions of NPU input and output tensors from HiAI model - std::vector npu_idims; - std::vector npu_odims; - int ret = - model_client_->GetModelIOTensorDim(model_name_, npu_idims, npu_odims); - CHECK_EQ(ret, hiai::AI_SUCCESS) - << "[NPU] Get the dimensions of input and output tensors failed."; - - // Check whether the data sizes of NPU input and output tensors are the - // same as CPU's, then create and initialize NPU input and output tensors. 
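// [Editor's sketch] In the deleted PrepareForRun() below, the data size of
// each NPU-side tensor is the product of the four NCHW extents reported by
// hiai::TensorDimension, which is then checked against the CPU tensor's
// dims().production(). Factored out, assuming the same getters the deleted
// code uses:
int64_t NpuElementCount(const hiai::TensorDimension& dim) {
  return static_cast<int64_t>(dim.GetNumber()) * dim.GetChannel() *
         dim.GetHeight() * dim.GetWidth();
}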
- npu_itensors_.resize(npu_idims.size()); - npu_otensors_.resize(npu_odims.size()); - npu_idatasizes_.resize(npu_idims.size()); - npu_odatasizes_.resize(npu_odims.size()); - for (size_t i = 0; i < npu_idims.size(); ++i) { - auto cpu_itensor = param.inputs[i].second; - CHECK(cpu_itensor); - VLOG(3) << "[NPU] CPU input dims[" << i << "]: " << cpu_itensor->dims(); - VLOG(3) << "[NPU] NPU input dims[" << i << "]: {" - << npu_idims[i].GetNumber() << "," << npu_idims[i].GetChannel() - << "," << npu_idims[i].GetHeight() << "," << npu_idims[i].GetWidth() - << "}"; - npu_idatasizes_[i] = npu_idims[i].GetNumber() * npu_idims[i].GetChannel() * - npu_idims[i].GetHeight() * npu_idims[i].GetWidth(); - CHECK_EQ(cpu_itensor->dims().production(), npu_idatasizes_[i]); - npu_itensors_[i].reset(new hiai::AiTensor); - npu_itensors_[i]->Init(&(npu_idims[i])); - } - for (size_t i = 0; i < npu_odims.size(); ++i) { - auto cpu_otensor = param.outputs[i].second; - CHECK(cpu_otensor); - VLOG(3) << "[NPU] CPU output dims[" << i << "]: " << cpu_otensor->dims(); - VLOG(3) << "[NPU] NPU output dims[" << i << "]: {" - << npu_odims[i].GetNumber() << "," << npu_odims[i].GetChannel() - << "," << npu_odims[i].GetHeight() << "," << npu_odims[i].GetWidth() - << "}"; - npu_odatasizes_[i] = npu_odims[i].GetNumber() * npu_odims[i].GetChannel() * - npu_odims[i].GetHeight() * npu_odims[i].GetWidth(); - if (cpu_otensor->dims().production() != npu_odatasizes_[i]) { - cpu_otensor->Resize({npu_odims[i].GetNumber(), - npu_odims[i].GetChannel(), - npu_odims[i].GetHeight(), - npu_odims[i].GetWidth()}); - } - npu_otensors_[i].reset(new hiai::AiTensor); - npu_otensors_[i]->Init(&(npu_odims[i])); - } -} - -void GraphCompute::Run() { - auto& param = this->Param(); - - // Check whether the data sizes of NPU input tensors are the same as - // CPU's, and copy the data of CPU input tensors to NPU's. - CHECK_EQ(param.inputs.size(), npu_itensors_.size()); - CHECK_EQ(param.outputs.size(), npu_otensors_.size()); - for (size_t i = 0; i < param.inputs.size(); ++i) { - auto cpu_itensor = param.inputs[i].second; - CHECK(cpu_itensor); - CHECK_EQ(cpu_itensor->dims().production(), npu_idatasizes_[i]); - std::memcpy(static_cast(npu_itensors_[i]->GetBuffer()), - cpu_itensor->data(), - sizeof(float) * static_cast(npu_idatasizes_[i])); - } - - // Run HiAI model with model name - std::string key = "model_name"; // Note: key seems must be model_name - model_context_.AddPara(key, model_name_); - auto GetCurrentUS = []() -> double { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; - }; - int istamp; - auto start_time = GetCurrentUS(); - CHECK_EQ(hiai::AI_SUCCESS, - model_client_->Process( - model_context_, npu_itensors_, npu_otensors_, 1000, istamp)); - VLOG(3) << "[NPU] Process cost " << GetCurrentUS() - start_time << " us"; - - // Check whether the data sizes of NPU output tensors are the same as - // CPU's, and copy the data of NPU output tensors to CPU's. 
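// [Editor's note] Both the input copy above and the output copy below size
// their memcpy as sizeof(float) * element_count, i.e. the deleted kernel
// assumes float32 I/O throughout -- consistent with its kFloat-only
// REGISTER_LITE_KERNEL registration further down.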
- for (size_t i = 0; i < param.outputs.size(); ++i) { - auto cpu_otensor = param.outputs[i].second; - CHECK(cpu_otensor); - CHECK_EQ(cpu_otensor->dims().production(), npu_odatasizes_[i]); - std::memcpy(cpu_otensor->mutable_data(), - static_cast(npu_otensors_[i]->GetBuffer()), - sizeof(float) * static_cast(npu_odatasizes_[i])); - } -} - -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(graph_op, - kNPU, - kFloat, - kNCHW, - paddle::lite::kernels::npu::GraphCompute, - def) - .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindInput("Weight", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kHost))}) - .Finalize(); diff --git a/lite/kernels/npu/graph_compute.h b/lite/kernels/npu/graph_compute.h deleted file mode 100644 index b289b8e42f..0000000000 --- a/lite/kernels/npu/graph_compute.h +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include "ai_ddk_lib/include/HiAiModelManagerService.h" -#include "lite/core/kernel.h" -#include "lite/core/op_registry.h" -#include "lite/core/types.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace npu { - -class GraphCompute : public KernelLite { - public: - using param_t = operators::GraphParam; - - void PrepareForRun() override; - - void Run() override; - - virtual ~GraphCompute() = default; - - private: - std::shared_ptr model_client_; - std::string model_name_; - hiai::AiContext model_context_; - - std::vector npu_idatasizes_; - std::vector npu_odatasizes_; - std::vector> npu_itensors_; - std::vector> npu_otensors_; -}; - -} // namespace npu -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/npu/subgraph_compute.cc b/lite/kernels/npu/subgraph_compute.cc old mode 100755 new mode 100644 index d9b1919506..6f32099274 --- a/lite/kernels/npu/subgraph_compute.cc +++ b/lite/kernels/npu/subgraph_compute.cc @@ -16,7 +16,7 @@ #include #include #include -#include "ai_ddk_lib/include/hiai_ir_build.h" +#include "hiai_ir_build.h" // NOLINT #include "lite/backends/npu/device.h" #include "lite/core/op_registry.h" #include "lite/kernels/npu/bridges/graph.h" @@ -39,13 +39,13 @@ int SubgraphEngine::BuildDeviceProgram() { op->CheckShape(); op->InferShape(); std::string op_type = op->op_info()->Type(); - if (!bridges.Exists("NPU", op_type)) { + if (!bridges.Exists(op_type, "kNPU")) { return subgraph::FAILED; } auto kernel = inst.kernel(); - status |= bridges.Select("NPU", op_type)(reinterpret_cast(&graph), - const_cast(op), - const_cast(kernel)); + status |= bridges.Select(op_type, "kNPU")(reinterpret_cast(&graph), + const_cast(op), + const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { return subgraph::FAILED; } @@ -57,26 +57,26 @@ int SubgraphEngine::BuildDeviceProgram() { std::vector device_inodes; std::vector device_onodes; for 
(auto& input_name : input_names_) { - if (graph.HasNode(input_name)) { - if (!graph.GetType(input_name).persistable()) { - device_inodes.push_back(*graph.GetNode(input_name)); + if (graph.Has(input_name)) { + if (graph.Get(input_name)->is_data()) { + device_inodes.push_back(*graph.Get(input_name)->data()); device_inames_.push_back(input_name); } else { LOG(WARNING) << "[NPU] Input node " << input_name - << " is skipped because it is a persistable node."; + << " is ignored because it is not a data node."; } } else { LOG(WARNING) << "[NPU] Input node " << input_name - << " is skipped because it does not exist."; + << " is ignored because it does not exist."; } } for (auto& output_name : output_names_) { - if (graph.HasNode(output_name)) { - device_onodes.push_back(*graph.GetNode(output_name)); + if (graph.Has(output_name)) { + device_onodes.push_back(*graph.Get(output_name)->data()); device_onames_.push_back(output_name); } else { LOG(WARNING) << "[NPU] Output node " << output_name - << " is skipped because it does not exist."; + << " is ignored because it does not exist."; } } CHECK(!device_inames_.empty()) @@ -108,14 +108,14 @@ int SubgraphEngine::BuildDeviceProgram() { origin_otensors_.resize(device_onames_.size()); device_otensors_.resize(device_onames_.size()); for (int i = 0; i < device_inames_.size(); i++) { - auto type = graph.GetType(device_inames_[i]); - auto precision = type.precision(); - auto layout = type.layout(); + auto node = graph.Get(device_inames_[i]); + auto precision = node->precision(); + auto layout = node->layout(); origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]); CHECK(origin_itensors_[i]); origin_idims_[i] = origin_itensors_[i]->dims(); - VLOG(3) << "[NPU] Inputs[" << i - << "] precision: " << PrecisionToStr(precision) + VLOG(3) << "[NPU] Inputs[" << i << "] name: " << device_inames_[i] + << " precision: " << PrecisionToStr(precision) << " layout: " << DataLayoutToStr(layout) << " dims: {" << device_idims[i].GetNumber() << "," << device_idims[i].GetChannel() << "," @@ -129,14 +129,14 @@ int SubgraphEngine::BuildDeviceProgram() { device_itensors_[i]->Init(&(device_idims[i])); } for (int i = 0; i < device_onames_.size(); i++) { - auto type = graph.GetType(device_onames_[i]); - auto precision = type.precision(); - auto layout = type.layout(); + auto node = graph.Get(device_onames_[i]); + auto precision = node->precision(); + auto layout = node->layout(); origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); CHECK(origin_otensors_[i]); origin_odims_[i] = origin_otensors_[i]->dims(); - VLOG(3) << "[NPU] Outputs[" << i - << "] precision: " << PrecisionToStr(precision) + VLOG(3) << "[NPU] Outputs[" << i << "] name: " << device_onames_[i] + << " precision: " << PrecisionToStr(precision) << " layout: " << DataLayoutToStr(layout) << " dims: {" << device_odims[i].GetNumber() << "," << device_odims[i].GetChannel() << "," diff --git a/lite/kernels/npu/subgraph_compute.h b/lite/kernels/npu/subgraph_compute.h old mode 100755 new mode 100644 index 27b4a36cfe..2cdc4a0e62 --- a/lite/kernels/npu/subgraph_compute.h +++ b/lite/kernels/npu/subgraph_compute.h @@ -17,7 +17,7 @@ #include #include #include -#include "ai_ddk_lib/include/HiAiModelManagerService.h" +#include "HiAiModelManagerService.h" #include "lite/core/kernel.h" #include "lite/kernels/npu/bridges/engine.h" #include "lite/kernels/npu/bridges/registry.h" diff --git a/lite/kernels/opencl/CMakeLists.txt b/lite/kernels/opencl/CMakeLists.txt index 3423b1e920..f4d3254a7b 100644 --- 
a/lite/kernels/opencl/CMakeLists.txt +++ b/lite/kernels/opencl/CMakeLists.txt @@ -14,7 +14,7 @@ add_kernel(pool_opencl OPENCL basic SRCS pool_compute.cc DEPS ${cl_kernel_deps}) add_kernel(io_copy_compute_opencl OPENCL basic SRCS io_copy_compute.cc DEPS ${tensor_lite} ${cl_kernel_deps}) add_kernel(relu_opencl OPENCL basic SRCS relu_compute.cc DEPS ${cl_kernel_deps}) add_kernel(depthwise_conv2d_opencl OPENCL basic SRCS depthwise_conv2d_compute.cc DEPS ${cl_kernel_deps}) -add_kernel(conv2d_1x1_opencl OPENCL basic SRCS conv2d_1x1_compute.cc DEPS ${cl_kernel_deps}) +#add_kernel(conv2d_1x1_opencl OPENCL basic SRCS conv2d_1x1_compute.cc DEPS ${cl_kernel_deps}) add_kernel(reshape_opencl OPENCL basic SRCS reshape_compute.cc DEPS ${cl_kernel_deps}) add_kernel(conv_opencl OPENCL basic SRCS conv_compute.cc DEPS ${cl_kernel_deps}) add_kernel(layout_opencl OPENCL basic SRCS layout_compute.cc DEPS ${cl_kernel_deps}) @@ -49,12 +49,14 @@ lite_cc_test(test_depthwise_conv2d_opencl SRCS depthwise_conv2d_compute_test.cc DEPS depthwise_conv2d_opencl op_registry program context cl_image_converter ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) -lite_cc_test(test_conv2d_1x1_opencl SRCS conv2d_1x1_compute_test.cc - DEPS conv2d_1x1_opencl cl_image_converter op_registry program context - ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) +#lite_cc_test(test_conv2d_1x1_opencl SRCS conv2d_1x1_compute_test.cc +# DEPS conv2d_1x1_opencl cl_image_converter op_registry program context +# ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) + lite_cc_test(test_reshape_opencl SRCS reshape_compute_test.cc DEPS reshape_opencl cl_image_converter op_registry program context ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) + lite_cc_test(test_conv_opencl SRCS conv_compute_test.cc DEPS conv_opencl op_registry program context ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl) diff --git a/lite/kernels/opencl/conv2d_1x1_compute.cc b/lite/kernels/opencl/conv2d_1x1_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/opencl/conv2d_1x1_compute_test.cc b/lite/kernels/opencl/conv2d_1x1_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/opencl/reshape_compute.cc b/lite/kernels/opencl/reshape_compute.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/opencl/reshape_compute_test.cc b/lite/kernels/opencl/reshape_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/kernels/x86/fc_compute_test.cc b/lite/kernels/x86/fc_compute_test.cc deleted file mode 100644 index abc0597457..0000000000 --- a/lite/kernels/x86/fc_compute_test.cc +++ /dev/null @@ -1,100 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
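// [Editor's sketch] A naive reference for what the deleted x86 fc test
// below exercises -- out[m, n] = sum_k x[m, k] * w[k, n] + b[n], with
// x:{2,3}, w:{3,4}, b:{1,4}. Names and the row-major layout are
// illustrative, not taken from the original test:
void fc_ref(const float* x, int m, int k,
            const float* w, int n,
            const float* b, float* out) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = b ? b[j] : 0.f;
      for (int p = 0; p < k; ++p) acc += x[i * k + p] * w[p * n + j];
      out[i * n + j] = acc;
    }
  }
}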
-#include "lite/kernels/x86/fc_compute.h" -#include -#include -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -TEST(fc_x86, retrive_op) { - auto fc = - KernelRegistry::Global().Create("fc"); - ASSERT_FALSE(fc.empty()); - ASSERT_TRUE(fc.front()); -} - -TEST(fc_x86, init) { - FcCompute fc; - ASSERT_EQ(fc.precision(), PRECISION(kFloat)); - ASSERT_EQ(fc.target(), TARGET(kX86)); -} - -TEST(fc_x86, run_test) { - lite::Tensor x, w, b, out; - constexpr int batch_size = 2; - std::vector x_shape{batch_size, 3}; - x.Resize(lite::DDim(x_shape)); - std::vector w_shape{3, 4}; - w.Resize(lite::DDim(w_shape)); - std::vector b_shape{1, 4}; - b.Resize(lite::DDim(b_shape)); - std::vector out_shape{1, 4}; - out.Resize(lite::DDim(out_shape)); - - auto x_data = x.mutable_data(); - auto w_data = w.mutable_data(); - auto b_data = b.mutable_data(); - auto out_data = out.mutable_data(); - - for (int64_t i = 0; i < x.dims().production(); i++) { - x_data[i] = static_cast(i); - } - for (int64_t i = 0; i < w.dims().production(); i++) { - w_data[i] = static_cast(i); - } - for (int64_t i = 0; i < b.dims().production(); i++) { - b_data[i] = static_cast(i); - } - - /* lite::x86::math::fc_compute_eigen(x_data, batch_size, 3, // - w_data, 3, 4, // - b_data, ref_data); */ - - // FcCompute fc; - FcCompute fc; - operators::FcParam param; - - param.in_num_col_dims = 1; - param.input = &x; - param.w = &w; - param.bias = &b; - param.output = &out; - param.in_mat_dims = x.dims(); - - // std::unique_ptr ctx(new KernelContext); - // ctx->As(); - fc.SetParam(param); - // fc.SetContext(std::move(ctx)); - fc.Run(); - - VLOG(3) << "output vs ref"; - for (int i = 0; i < out.dims().production(); i++) { - VLOG(3) << out_data[i]; - } - - /* for (int i = 0; i < out.dims().production(); ++i) { - EXPECT_NEAR(out_data[i], ref_data[i], 1e-5); - }*/ -} - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_KERNEL(fc, kX86, kFloat, kNCHW, def); diff --git a/lite/kernels/x86/layer_norm_compute.h b/lite/kernels/x86/layer_norm_compute.h index bbbdb91deb..ca2ddf60c5 100644 --- a/lite/kernels/x86/layer_norm_compute.h +++ b/lite/kernels/x86/layer_norm_compute.h @@ -78,7 +78,7 @@ class LayerNormCompute : public KernelLite { Scale->data(), Bias->data(), static_cast(left), - static_cast(epsilon), + epsilon, right); } diff --git a/lite/kernels/x86/relu_compute.cc b/lite/kernels/x86/relu_compute.cc deleted file mode 100644 index 684b144254..0000000000 --- a/lite/kernels/x86/relu_compute.cc +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/kernels/x86/relu_compute.h" - -REGISTER_LITE_KERNEL(relu, - kX86, - kFloat, - kNCHW, - paddle::lite::kernels::x86::ReluCompute, - def) - .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))}) - .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))}) - .Finalize(); diff --git a/lite/kernels/x86/relu_compute.h b/lite/kernels/x86/relu_compute.h deleted file mode 100644 index b80a99302a..0000000000 --- a/lite/kernels/x86/relu_compute.h +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once - -#include -#include -#include "lite/core/kernel.h" -#include "lite/core/op_lite.h" -#include "lite/core/op_registry.h" -#include "lite/core/type_system.h" -#include "lite/operators/relu_op.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace x86 { - -template -class ReluCompute : public KernelLite { - public: - using param_t = operators::ActivationParam; - - void Run() override { - auto& param = *param_.get_mutable(); - auto n = param.X->dims().production(); - const float* input = param.X->data(); - float* output = param.Out->mutable_data(); - for (int i = 0; i < n; i++) { - output[i] = std::max(0.f, input[i]); - } - } - - virtual ~ReluCompute() = default; -}; - -} // namespace x86 -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/xpu/bridges/act_op.cc b/lite/kernels/xpu/bridges/act_op.cc index f674af84ca..e3d4588aa2 100644 --- a/lite/kernels/xpu/bridges/act_op.cc +++ b/lite/kernels/xpu/bridges/act_op.cc @@ -43,20 +43,21 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(out_type->layout() == DATALAYOUT(kNCHW)); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Act node if (op_type == "relu") { - graph->AddNode(out_name, graph->builder_.CreateRelu(*x_node)); + graph->Add(out_name, graph->builder_.CreateRelu(*x_node->data())); } else if (op_type == "tanh") { - graph->AddNode(out_name, graph->builder_.CreateUnaryOp("tanh", *x_node)); + graph->Add(out_name, + graph->builder_.CreateUnaryOp("tanh", *x_node->data())); } else if (op_type == "gelu") { - graph->AddNode(out_name, graph->builder_.CreateGelu(*x_node)); + graph->Add(out_name, graph->builder_.CreateGelu(*x_node->data())); } else { // TODO(hong19860320) supports more activation ops LOG(WARNING) << "[XPU] Unsupported activation type " << op_type; @@ -70,6 +71,6 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, relu, paddle::lite::subgraph::xpu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(XPU, tanh, 
paddle::lite::subgraph::xpu::ActConverter); -REGISTER_SUBGRAPH_BRIDGE(XPU, gelu, paddle::lite::subgraph::xpu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(relu, kXPU, paddle::lite::subgraph::xpu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(tanh, kXPU, paddle::lite::subgraph::xpu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE(gelu, kXPU, paddle::lite::subgraph::xpu::ActConverter); diff --git a/lite/kernels/xpu/bridges/act_op_test.cc b/lite/kernels/xpu/bridges/act_op_test.cc deleted file mode 100644 index 1a3efab46e..0000000000 --- a/lite/kernels/xpu/bridges/act_op_test.cc +++ /dev/null @@ -1,102 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/xpu/bridges/registry.h" -#include "lite/kernels/xpu/bridges/test_helper.h" -#include "lite/operators/activation_ops.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace xpu { -namespace bridges { - -void relu_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto out = - scope->FindVar(op_info->Output("Out").front())->GetMutable(); - auto x_data = x->data(); - auto out_data = out->mutable_data(); - DDim x_dims = x->dims(); - DDim out_dims = out->dims(); - CHECK_EQ(x_dims.production(), out_dims.production()); - for (int i = 0; i < out_dims.production(); i++) { - out_data[i] = std::max(0.f, x_data[i]); - } -} - -void test_relu(int bs, int ic, int ih, int iw) { - // prepare input&output variables - Scope scope; - std::string x_var_name("x"); - std::string out_var_name("out"); - std::string out_ref_var_name("out_ref"); - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - - // initialize input&output data - FillTensor(x); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("relu"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetOutput("Out", {out_var_name}); - - // create and convert op to XPU model, and run it on XPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - relu_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - VLOG(5) << i; - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); - } -} - -TEST(NPUBridges, relu) { - for (auto bs : {1, 3}) { - for (auto ic : {3, 4}) { - for (auto ih : {2, 5}) { - for (auto iw : {5, 9}) { - VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih - << " iw: " << iw; - test_relu(bs, ic, ih, iw); - } - } - } - } -} - -} // namespace bridges -} // namespace xpu -} // namespace kernels -} // namespace lite -} // 
namespace paddle - -USE_LITE_OP(relu); -USE_XPU_BRIDGE(relu); diff --git a/lite/kernels/xpu/bridges/batch_norm_op.cc b/lite/kernels/xpu/bridges/batch_norm_op.cc index 980f241660..d84b9cc4f1 100644 --- a/lite/kernels/xpu/bridges/batch_norm_op.cc +++ b/lite/kernels/xpu/bridges/batch_norm_op.cc @@ -37,55 +37,61 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK(x_type->layout() == DATALAYOUT(kNCHW)); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); + auto scale_name = op_info->Input("Scale").front(); auto scale_type = kernel->GetInputDeclType("Scale"); CHECK(scale_type->precision() == PRECISION(kFloat)); CHECK(scale_type->layout() == DATALAYOUT(kNCHW)); auto scale = scope->FindMutableTensor(scale_name); + auto bias_name = op_info->Input("Bias").front(); auto bias_type = kernel->GetInputDeclType("Bias"); CHECK(bias_type->precision() == PRECISION(kFloat)); CHECK(bias_type->layout() == DATALAYOUT(kNCHW)); auto bias = scope->FindMutableTensor(bias_name); + auto mean_name = op_info->Input("Mean").front(); auto mean_type = kernel->GetInputDeclType("Mean"); CHECK(mean_type->precision() == PRECISION(kFloat)); CHECK(mean_type->layout() == DATALAYOUT(kNCHW)); auto mean = scope->FindMutableTensor(mean_name); + auto variance_name = op_info->Input("Variance").front(); auto variance_type = kernel->GetInputDeclType("Variance"); CHECK(variance_type->precision() == PRECISION(kFloat)); CHECK(variance_type->layout() == DATALAYOUT(kNCHW)); auto variance = scope->FindMutableTensor(variance_name); + auto y_name = op_info->Output("Y").front(); auto y_type = kernel->GetOutputDeclType("Y"); CHECK(y_type->precision() == PRECISION(kFloat)); CHECK(y_type->layout() == DATALAYOUT(kNCHW)); + auto epsilon = op_info->GetAttr("epsilon"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Scale, Bias, Mean, Variance node - auto scale_const_node = graph->AddNode(scale_name, *scale); - auto bias_const_node = graph->AddNode(bias_name, *bias); - auto mean_const_node = graph->AddNode(mean_name, *mean); - auto variance_const_node = graph->AddNode(variance_name, *variance); + auto scale_node = graph->Add(scale_name, *scale); + auto bias_node = graph->Add(bias_name, *bias); + auto mean_node = graph->Add(mean_name, *mean); + auto variance_node = graph->Add(variance_name, *variance); // Batch Norm node and extract the first field as the output node - auto batch_norm_node = graph->builder_.CreateBatchNorm(*x_node, - *scale_const_node, - *bias_const_node, - *mean_const_node, - *variance_const_node, + auto batch_norm_data = graph->builder_.CreateBatchNorm(*x_node->data(), + *scale_node->data(), + *bias_node->data(), + *mean_node->data(), + *variance_node->data(), 1, epsilon); - graph->AddNode(y_name, graph->builder_.GetField(batch_norm_node, 0)); + graph->Add(y_name, graph->builder_.GetField(batch_norm_data, 0)); return SUCCESS; } @@ -94,6 +100,6 @@ int BatchNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - batch_norm, +REGISTER_SUBGRAPH_BRIDGE(batch_norm, + kXPU, paddle::lite::subgraph::xpu::BatchNormConverter); diff --git a/lite/kernels/xpu/bridges/batch_norm_op_test.cc b/lite/kernels/xpu/bridges/batch_norm_op_test.cc deleted file mode 100644 index 
dec475530a..0000000000 --- a/lite/kernels/xpu/bridges/batch_norm_op_test.cc +++ /dev/null @@ -1,164 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/operators/batch_norm_op.h" -#include -#include "lite/core/op_registry.h" -#include "lite/kernels/xpu/bridges/registry.h" -#include "lite/kernels/xpu/bridges/test_helper.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace xpu { -namespace bridges { - -template -void batch_norm_ref(const std::shared_ptr op) { - Scope* scope = op->scope(); - const OpInfo* op_info = op->op_info(); - auto x = scope->FindVar(op_info->Input("X").front())->GetMutable(); - auto y = scope->FindVar(op_info->Output("Y").front())->GetMutable(); - auto bias = - scope->FindVar(op_info->Input("Bias").front())->GetMutable(); - auto scale = - scope->FindVar(op_info->Input("Scale").front())->GetMutable(); - auto mean = - scope->FindVar(op_info->Input("Mean").front())->GetMutable(); - auto variance = - scope->FindVar(op_info->Input("Variance").front())->GetMutable(); - - auto x_data = x->data(); - auto y_data = y->mutable_data(); - auto scale_data = scale->mutable_data(); - auto bias_data = bias->mutable_data(); - auto mean_data = mean->mutable_data(); - auto variance_data = variance->mutable_data(); - DDim x_dims = x->dims(); - - float epsilon = op_info->GetAttr("epsilon"); - auto data_layout = op_info->GetAttr("data_layout"); - - bool global_stats = op_info->GetAttr("use_global_stats"); - if (global_stats) { - int64_t outer_size = 0; - int64_t channel_size = 0; - int64_t inner_size = 0; - if (data_layout == "NCHW") { - outer_size = x_dims[0]; - channel_size = x_dims[1]; - inner_size = x_dims.Slice(2, x_dims.size()).production(); - } else { - LOG(FATAL) << "Unknown storage order: " << data_layout; - } - auto x_ptr = x_data; - auto y_ptr = y_data; - for (int o = 0; o < outer_size; o++) { - for (int c = 0; c < channel_size; c++) { - for (int i = 0; i < inner_size; i++) { - dtype norm_x = - (*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon); - *y_ptr = norm_x * scale_data[c] + bias_data[c]; - x_ptr++; - y_ptr++; - } - } - } - } -} - -void test_batch_norm(int bs, int ic, int ih, int iw, float epsilon) { - // prepare input&output variables - Scope scope; - std::string x_var_name = "x"; - std::string out_var_name = "out"; - std::string out_ref_var_name = "out_ref"; - std::string scale_var_name = "scale"; - std::string bias_var_name = "bias"; - std::string mean_var_name = "mean"; - std::string variance_var_name = "variance"; - auto* x = scope.Var(x_var_name)->GetMutable(); - auto* scale = scope.Var(scale_var_name)->GetMutable(); - auto* bias = scope.Var(bias_var_name)->GetMutable(); - auto* mean = scope.Var(mean_var_name)->GetMutable(); - auto* variance = scope.Var(variance_var_name)->GetMutable(); - auto* out = scope.Var(out_var_name)->GetMutable(); - auto* out_ref = scope.Var(out_ref_var_name)->GetMutable(); - x->Resize({bs, ic, ih, iw}); - 
scale->Resize({ic}); - bias->Resize({ic}); - mean->Resize({ic}); - variance->Resize({ic}); - - // initialize input&output data - FillTensor(x); - FillTensor(scale); - FillTensor(bias); - FillTensor(mean); - // variance > 0 - FillTensor(variance, 1.f, 5.f); - - // initialize op desc - cpp::OpDesc opdesc; - opdesc.SetType("batch_norm"); - opdesc.SetInput("X", {x_var_name}); - opdesc.SetInput("Scale", {scale_var_name}); - opdesc.SetInput("Bias", {bias_var_name}); - opdesc.SetInput("Mean", {mean_var_name}); - opdesc.SetInput("Variance", {variance_var_name}); - opdesc.SetOutput("Y", {out_var_name}); - opdesc.SetAttr("is_test", 1); - opdesc.SetAttr("use_global_stats", true); - opdesc.SetAttr("epsilon", epsilon); - opdesc.SetAttr("momentum", 0.9f); - opdesc.SetAttr("data_layout", std::string("NCHW")); - - // create and convert op to XPU model, then run it on XPU - auto op = CreateOp(opdesc, &scope); - LauchOp(op, {x_var_name}, {out_var_name}); - out_ref->CopyDataFrom(*out); - - // execute reference implementation and save to output tensor - batch_norm_ref(op); - - // compare results - auto* out_data = out->mutable_data(); - auto* out_ref_data = out_ref->mutable_data(); - for (int i = 0; i < out->dims().production(); i++) { - EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5); - } -} - -TEST(NPUBridges, batch_norm) { - for (auto bs : {1, 3}) { - for (auto ic : {2, 3}) { - for (auto ih : {4}) { - for (auto iw : {5}) { - for (auto epsilon : {1e-5f}) { - test_batch_norm(bs, ic, ih, iw, epsilon); - } - } - } - } - } -} - -} // namespace bridges -} // namespace xpu -} // namespace kernels -} // namespace lite -} // namespace paddle - -USE_LITE_OP(batch_norm); -USE_XPU_BRIDGE(batch_norm); diff --git a/lite/kernels/xpu/bridges/conv_op.cc b/lite/kernels/xpu/bridges/conv_op.cc index 5e9e5448a1..fe9c598847 100644 --- a/lite/kernels/xpu/bridges/conv_op.cc +++ b/lite/kernels/xpu/bridges/conv_op.cc @@ -61,11 +61,11 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(dilations.size(), 2L); // Input node - std::shared_ptr input_node = nullptr; - if (graph->HasNode(input_name)) { - input_node = graph->GetNode(input_name); + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + input_node = graph->Get(input_name); } else { - input_node = graph->AddNode(input_name, input_dims); + input_node = graph->Add(input_name, *input); } if (paddings.size() == 2L) { @@ -99,7 +99,7 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { DDim output_dims(output_shape); // Filter node - auto filter_const_node = graph->AddNode(filter_name, *filter); + auto filter_node = graph->Add(filter_name, *filter); // Conv node auto conv_attrs = xtcl::make_node(); @@ -114,9 +114,9 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { conv_attrs->out_layout = ""; // conv_attrs->out_dtype = ""; auto conv_node = - graph->AddNode(output_name, - graph->builder_.CreateConv2D( - *input_node, *filter_const_node, conv_attrs)); + graph->Add(output_name, + graph->builder_.CreateConv2D( + *input_node->data(), *filter_node->data(), conv_attrs)); // Add bias node if exists bias // supports the bias nodes with the following dimensions @@ -149,30 +149,27 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { << " isn't supported in conv2d Op when output dimension is " << output_dims; } - std::shared_ptr bias_node = nullptr; - if (graph->HasNode(bias_name)) { - // Bias node from input node - bias_node = graph->GetNode(bias_name); + std::shared_ptr bias_node = nullptr; + if 
(graph->Has(bias_name)) { + bias_node = graph->Get(bias_name); } else { - // Bias node with const data - bias_node = graph->AddNode(bias_name, *bias, bias_shape); + bias_node = graph->Add(bias_name, *bias, bias_shape); } - std::shared_ptr add_node = nullptr; if (is_channel_bias) { - add_node = graph->AddNode( - output_name, - graph->builder_.CreateBiasAdd(*conv_node, 1, *bias_node)); + conv_node = graph->Add(output_name, + graph->builder_.CreateBiasAdd( + *conv_node->data(), 1, *bias_node->data())); } else { - add_node = graph->AddNode( - output_name, - graph->builder_.CreateBinaryOp("add", *conv_node, *bias_node)); + conv_node = + graph->Add(output_name, + graph->builder_.CreateBinaryOp( + "add", *conv_node->data(), *bias_node->data())); } - conv_node = add_node; } if (fuse_relu) { // Append relu node if fuse_relu is true - graph->AddNode(output_name, graph->builder_.CreateRelu(*conv_node)); + graph->Add(output_name, graph->builder_.CreateRelu(*conv_node->data())); } return REBUILD_WHEN_SHAPE_CHANGED; } @@ -182,9 +179,9 @@ int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - conv2d, +REGISTER_SUBGRAPH_BRIDGE(conv2d, + kXPU, paddle::lite::subgraph::xpu::ConvConverter); -REGISTER_SUBGRAPH_BRIDGE(XPU, - depthwise_conv2d, +REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d, + kXPU, paddle::lite::subgraph::xpu::ConvConverter); diff --git a/lite/kernels/xpu/bridges/dropout_op.cc b/lite/kernels/xpu/bridges/dropout_op.cc old mode 100755 new mode 100644 index ae81facd53..df869e17ff --- a/lite/kernels/xpu/bridges/dropout_op.cc +++ b/lite/kernels/xpu/bridges/dropout_op.cc @@ -46,21 +46,21 @@ int DropoutConverter(void* ctx, OpLite* op, KernelBase* kernel) { op_info->GetAttr("dropout_implementation"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Dropout node if (dropout_implementation == "downgrade_in_infer") { - graph->AddNode( - out_name, - graph->builder_.CreateScale(*x_node, 1.f - dropout_prob, 0.0f, false)); + graph->Add(out_name, + graph->builder_.CreateScale( + *x_node->data(), 1.f - dropout_prob, 0.0f, false)); } else if (dropout_implementation == "upscale_in_train") { - graph->AddNode(out_name, - graph->builder_.CreateScale(*x_node, 1.0f, 0.0f, false)); + graph->Add(out_name, + graph->builder_.CreateScale(*x_node->data(), 1.0f, 0.0f, false)); } else { LOG(WARNING) << "[XPU] Unsupported dropout_implementation == " << dropout_implementation << " for dropout"; @@ -74,6 +74,6 @@ int DropoutConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - dropout, +REGISTER_SUBGRAPH_BRIDGE(dropout, + kXPU, paddle::lite::subgraph::xpu::DropoutConverter); diff --git a/lite/kernels/xpu/bridges/elementwise_ops.cc b/lite/kernels/xpu/bridges/elementwise_ops.cc index 49a42c55d6..7fcae312b9 100644 --- a/lite/kernels/xpu/bridges/elementwise_ops.cc +++ b/lite/kernels/xpu/bridges/elementwise_ops.cc @@ -50,29 +50,31 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto axis = op_info->GetAttr("axis"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = 
graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Y node - std::shared_ptr y_node = nullptr; - if (graph->HasNode(y_name)) { - y_node = graph->GetNode(y_name); + std::shared_ptr y_node = nullptr; + if (graph->Has(y_name)) { + y_node = graph->Get(y_name); } else { - y_node = graph->AddNode(y_name, y_dims); + y_node = graph->Add(y_name, *y); } // Elementwise node - std::shared_ptr elementwise_node = nullptr; + std::shared_ptr elt_node = nullptr; if (y_dims.size() == 1) { - elementwise_node = graph->AddNode( - out_name, graph->builder_.CreateBiasAdd(*x_node, axis, *y_node)); + elt_node = graph->Add( + out_name, + graph->builder_.CreateBiasAdd(*x_node->data(), axis, *y_node->data())); } else if (x_dims.size() == y_dims.size()) { - elementwise_node = graph->AddNode( - out_name, graph->builder_.CreateBinaryOp("add", *x_node, *y_node)); + elt_node = graph->Add(out_name, + graph->builder_.CreateBinaryOp( + "add", *x_node->data(), *y_node->data())); } else { LOG(WARNING) << "[XPU] elementwise_add only support y of one dimension, or x " @@ -88,6 +90,6 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - elementwise_add, +REGISTER_SUBGRAPH_BRIDGE(elementwise_add, + kXPU, paddle::lite::subgraph::xpu::ElementwiseConverter); diff --git a/lite/kernels/xpu/bridges/gather_op.cc b/lite/kernels/xpu/bridges/gather_op.cc old mode 100755 new mode 100644 index 06d1c67b0d..845bbb8d98 --- a/lite/kernels/xpu/bridges/gather_op.cc +++ b/lite/kernels/xpu/bridges/gather_op.cc @@ -54,38 +54,42 @@ int GatherConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto out_dims = out->dims(); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Index node - std::shared_ptr index_node = nullptr; - if (graph->HasNode(index_name)) { - index_node = graph->GetNode(index_name); + std::shared_ptr index_node = nullptr; + if (graph->Has(index_name)) { + index_node = graph->Get(index_name); } else { - index_node = graph->AddNode( - index_name, index_dims, index_type->precision(), index_type->layout()); + index_node = graph->Add(index_name, *index); } // Flatten index node if (index_dims.size() != 1) { index_node = - graph->AddNode(index_name + "/reshape", - graph->builder_.CreateReshape(*index_node, {-1}), - index_type->precision(), - index_type->layout()); + graph->Add(index_name + "/reshape", + graph->builder_.CreateReshape(*index_node->data(), {-1}), + index_node->precision(), + index_node->layout()); } // Reshape the gather node with the inferred shape as the output node - auto gather_node = graph->AddNode( - out_name, - graph->builder_.CreateGather(*x_node, *index_node, /* axis= */ 0)); + auto gather_node = + graph->Add(out_name, + graph->builder_.CreateGather( + *x_node->data(), *index_node->data(), /* axis= */ 0), + x_node->precision(), + x_node->layout()); if (out_dims.size() != 2) { - graph->AddNode(out_name, - graph->builder_.CreateReshape( - *gather_node, CvtShape(out_dims))); + graph->Add(out_name, + graph->builder_.CreateReshape(*gather_node->data(), + CvtShape(out_dims)), + gather_node->precision(), + gather_node->layout()); } return SUCCESS; } @@ -95,6 +99,6 @@ int GatherConverter(void* ctx, OpLite* op, KernelBase* kernel) 
{ } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - gather, +REGISTER_SUBGRAPH_BRIDGE(gather, + kXPU, paddle::lite::subgraph::xpu::GatherConverter); diff --git a/lite/kernels/xpu/bridges/graph.cc b/lite/kernels/xpu/bridges/graph.cc old mode 100755 new mode 100644 index 1691e4b0c5..43aaad3402 --- a/lite/kernels/xpu/bridges/graph.cc +++ b/lite/kernels/xpu/bridges/graph.cc @@ -21,71 +21,70 @@ namespace lite { namespace subgraph { namespace xpu { -std::shared_ptr Graph::AddNode(const std::string& name, - const xtcl::xExpr& layer, - PrecisionType precision, - DataLayoutType layout) { - auto unique_name = [&](const std::string& key) { - int idx = 1; - auto it = counts_.find(key); - if (it == counts_.end()) { - counts_.insert(std::make_pair(key, idx)); - } else { - idx = ++(it->second); - } - return key + "_" + std::to_string(idx); - }; +int Graph::Add(const std::string& name, std::shared_ptr node) { auto it = nodes_.find(name); if (it != nodes_.end()) { - // Only variable can rebind the name - CHECK(!it->second.second.persistable()) << "[XPU] Node " << name - << " redefined."; - // Generate a new unique name as the key to bind the origin node if the - // origin node isn't a const node: new_name->node - nodes_.insert(std::make_pair(unique_name(name + "_var"), it->second)); - nodes_.erase(it); + // Only variable node can be shared with the same name + if (!node->is_var() || !it->second.back()->is_var()) { + LOG(FATAL) << "[XPU] Const or data node " << name << " is redefined."; + return -1; + } + } else { + auto ret = nodes_.insert( + std::make_pair(name, std::vector>())); + CHECK(ret.second); + it = ret.first; } - // Create a new node and bind with the name: name->new_node - auto node = std::make_shared(layer); - nodes_.insert(std::make_pair( - name, std::make_pair(node, Type(precision, layout, false)))); - builder_.SetLayer(unique_name(name + "_op")); - return node; + it->second.push_back(node); + return it->second.size(); } -// Const node -std::shared_ptr Graph::AddNode(const std::string& name, - const Tensor& tensor, - PrecisionType precision, - DataLayoutType layout) { - return AddNode(name, tensor, tensor.dims().Vectorize(), precision, layout); +// Variable node +std::shared_ptr Graph::Add(const std::string& name, + const xtcl::xExpr& layer, + PrecisionType precision, + DataLayoutType layout) { + auto node = std::make_shared(precision, layout, Node::Role::kVar); + auto idx = Add(name, node); + CHECK_GE(idx, 1); + node->set_data(std::make_shared(layer)); + // Generate a unique name for the current XTCL layer + builder_.SetLayer(name + "__" + std::to_string(idx)); + return node; } -std::shared_ptr Graph::AddNode(const std::string& name, - const Tensor& tensor, - std::vector shape, - PrecisionType precision, - DataLayoutType layout) { - CHECK(!HasNode(name)) << "[NPU] Node " << name << " redefined."; - auto node = std::make_shared(builder_.CreateTensor( - name, CvtShape(shape), CvtPrecisionType(precision))); - nodes_.insert(std::make_pair( - name, std::make_pair(node, Type(precision, layout, true)))); - params_.emplace( - std::make_pair(name, *CvtTensor(tensor, shape, precision, layout))); +// Const or data node +std::shared_ptr Graph::Add(const std::string& name, + const Tensor& tensor, + std::vector shape, + DataLayoutType layout) { + std::shared_ptr node = nullptr; + PrecisionType precision = tensor.precision(); + if (tensor.persistable()) { + // Const node + node = std::make_shared(precision, layout, Node::Role::kConst); + auto idx = Add(name, node); + CHECK_EQ(idx, 
1); + node->set_data(std::make_shared(builder_.CreateTensor( + name, CvtShape(shape), CvtPrecisionType(precision)))); + params_.emplace(std::make_pair(name, *CvtTensor(tensor, shape, layout))); + } else { + // Data node + node = Add(name, shape, precision, layout); + } return node; } // Data node -std::shared_ptr Graph::AddNode(const std::string& name, - std::vector shape, - PrecisionType precision, - DataLayoutType layout) { - CHECK(!HasNode(name)) << "[NPU] Node " << name << " redefined."; - auto node = std::make_shared(builder_.CreateTensor( - name, CvtShape(shape), CvtPrecisionType(precision))); - nodes_.insert(std::make_pair( - name, std::make_pair(node, Type(precision, layout, false)))); +std::shared_ptr Graph::Add(const std::string& name, + std::vector shape, + PrecisionType precision, + DataLayoutType layout) { + auto node = std::make_shared(precision, layout, Node::Role::kData); + auto idx = Add(name, node); + CHECK_EQ(idx, 1); + node->set_data(std::make_shared(builder_.CreateTensor( + name, CvtShape(shape), CvtPrecisionType(precision)))); return node; } diff --git a/lite/kernels/xpu/bridges/graph.h b/lite/kernels/xpu/bridges/graph.h old mode 100755 new mode 100644 index 3107346851..dafd8d8532 --- a/lite/kernels/xpu/bridges/graph.h +++ b/lite/kernels/xpu/bridges/graph.h @@ -28,78 +28,78 @@ namespace lite { namespace subgraph { namespace xpu { -// Type of graph nodes -class Type { +// Graph and node is defined to collect all of converted XTCL IR nodes +class Node { public: - Type(PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW), - bool persistable = false) - : precision_(precision), layout_(layout), persistable_(persistable) {} - + enum class Role { + kVar = 0, + kConst, + kData, + }; + + Node(std::shared_ptr data, + PrecisionType precision, + DataLayoutType layout, + Role role) + : data_(data), precision_(precision), layout_(layout), role_(role) {} + Node(PrecisionType precision, DataLayoutType layout, Role role) + : precision_(precision), layout_(layout), role_(role) {} + + void set_data(std::shared_ptr data) { data_ = data; } void set_precision(PrecisionType precision) { precision_ = precision; } void set_layout(DataLayoutType layout) { layout_ = layout; } - void set_persistable(bool persistable) { persistable_ = persistable; } + void set_role(Role role) { role_ = role; } + std::shared_ptr data() { return data_; } PrecisionType precision() const { return precision_; } DataLayoutType layout() const { return layout_; } - bool persistable() const { return persistable_; } + Role role() const { return role_; } + bool is_var() const { return role_ == Role::kVar; } + bool is_const() const { return role_ == Role::kConst; } + bool is_data() const { return role_ == Role::kData; } private: + std::shared_ptr data_{nullptr}; PrecisionType precision_{PRECISION(kFloat)}; DataLayoutType layout_{DATALAYOUT(kNCHW)}; - bool persistable_{false}; + Role role_{Role::kVar}; }; -// Graph to collect all of converted XPU IR nodes class Graph { public: - // Layer node - std::shared_ptr AddNode( - const std::string& name, - const xtcl::xExpr& layer, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)); + int Add(const std::string& name, std::shared_ptr node); + + // Variable node + std::shared_ptr Add(const std::string& name, + const xtcl::xExpr& layer, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)); + + // Const or data node + std::shared_ptr Add(const std::string& name, + const 
Tensor& tensor, + std::vector shape, + DataLayoutType layout = DATALAYOUT(kNCHW)); + + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, tensor, tensor.dims().Vectorize(), layout); + } - // Const node - std::shared_ptr AddNode( - const std::string& name, - const Tensor& tensor, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)); - - std::shared_ptr AddNode( - const std::string& name, - const Tensor& tensor, - std::vector shape, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)); - - std::shared_ptr AddNode( - const std::string& name, - const Tensor& tensor, - DDim dims, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, tensor, dims.Vectorize(), precision, layout); + std::shared_ptr Add(const std::string& name, + const Tensor& tensor, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, tensor, dims.Vectorize(), layout); } + // Const node template - std::shared_ptr AddNode( - const std::string& name, - const std::vector& data, - std::vector shape = {}, - DataLayoutType layout = DATALAYOUT(kNCHW)) { - const std::type_info& info = typeid(T); - PrecisionType precision = PRECISION(kFloat); - if (info == typeid(float)) { - precision = PRECISION(kFloat); - } else if (info == typeid(int8_t)) { - precision = PRECISION(kFloat); - } else if (info == typeid(int32_t)) { - precision = PRECISION(kInt32); - } else { - LOG(FATAL) << "[XPU] Unknow data type " << info.name(); - } + std::shared_ptr Add(const std::string& name, + const std::vector& data, + std::vector shape = {}, + DataLayoutType layout = DATALAYOUT(kNCHW)) { if (shape.empty()) { shape = {static_cast(data.size())}; } else { @@ -111,70 +111,61 @@ class Graph { } Tensor tensor; tensor.Resize(shape); + tensor.set_persistable(true); std::memcpy(reinterpret_cast(tensor.mutable_data()), reinterpret_cast(data.data()), data.size() * sizeof(T)); - return AddNode(name, tensor, precision, layout); + return Add(name, tensor, layout); } template - std::shared_ptr AddNode( - const std::string& name, - const std::vector& data, - DDim dims, - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, data, dims.Vectorize(), layout); + std::shared_ptr Add(const std::string& name, + const std::vector& data, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, data, dims.Vectorize(), layout); } template - std::shared_ptr AddNode( - const std::string& name, - T value, - std::vector shape = {1}, - DataLayoutType layout = DATALAYOUT(kNCHW)) { + std::shared_ptr Add(const std::string& name, + T value, + std::vector shape = {1}, + DataLayoutType layout = DATALAYOUT(kNCHW)) { int64_t size = 1; for (auto i : shape) { size *= i; } std::vector data(size, value); - return AddNode(name, data, shape, layout); + return Add(name, data, shape, layout); } template - std::shared_ptr AddNode( - const std::string& name, - T value, - DDim dims, - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, value, dims.Vectorize(), layout); + std::shared_ptr Add(const std::string& name, + T value, + DDim dims, + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, value, dims.Vectorize(), layout); } // Data node - std::shared_ptr AddNode( - const std::string& name, - std::vector shape, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)); 
- - std::shared_ptr AddNode( - const std::string& name, - DDim dims, - PrecisionType precision = PRECISION(kFloat), - DataLayoutType layout = DATALAYOUT(kNCHW)) { - return AddNode(name, dims.Vectorize(), precision, layout); - } - - std::shared_ptr GetNode(const std::string& name) { - CHECK(HasNode(name)) << "[XPU] Node " << name << " not found."; - return nodes_.at(name).first; + std::shared_ptr Add(const std::string& name, + std::vector shape, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)); + + std::shared_ptr Add(const std::string& name, + DDim dims, + PrecisionType precision = PRECISION(kFloat), + DataLayoutType layout = DATALAYOUT(kNCHW)) { + return Add(name, dims.Vectorize(), precision, layout); } - const Type& GetType(const std::string& name) { - CHECK(HasNode(name)) << "[XPU] Node " << name << " not found."; - return nodes_.at(name).second; + std::shared_ptr Get(const std::string& name) { + CHECK(Has(name)) << "[XPU] Node " << name << " not found."; + return nodes_.at(name).back(); } - bool HasNode(const std::string& name) { + bool Has(const std::string& name) { return nodes_.find(name) != nodes_.end(); } @@ -184,9 +175,7 @@ class Graph { xtcl::network::xTensorCompiler::ParamNDArrayMap params_; private: - std::unordered_map, Type>> - nodes_; - std::unordered_map counts_; + std::unordered_map>> nodes_; }; } // namespace xpu diff --git a/lite/kernels/xpu/bridges/layer_norm_op.cc b/lite/kernels/xpu/bridges/layer_norm_op.cc old mode 100755 new mode 100644 index 601dd42770..3ad190b73f --- a/lite/kernels/xpu/bridges/layer_norm_op.cc +++ b/lite/kernels/xpu/bridges/layer_norm_op.cc @@ -51,23 +51,23 @@ int LayerNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto x_inner_size = x_dims.Slice(axis, x_rank).production(); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } if (reshape) { auto reshaped_x_dims = x_dims.Slice(0, axis).Vectorize(); reshaped_x_dims.push_back(x_inner_size); - x_node = - graph->AddNode(x_name + "/reshape", - graph->builder_.CreateReshape( - *x_node, CvtShape(reshaped_x_dims))); + x_node = graph->Add( + x_name + "/reshape", + graph->builder_.CreateReshape( + *x_node->data(), CvtShape(reshaped_x_dims))); } // Scale node - std::shared_ptr scale_const_node = nullptr; + std::shared_ptr scale_node = nullptr; if (HasInputArg(op_info, scope, "Scale")) { auto scale_name = op_info->Input("Scale").front(); auto scale_type = kernel->GetInputDeclType("Scale"); @@ -77,14 +77,13 @@ int LayerNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto scale_dims = scale->dims(); CHECK_EQ(scale_dims.size(), 1); CHECK_EQ(scale_dims.production(), x_inner_size); - scale_const_node = graph->AddNode(scale_name, *scale); + scale_node = graph->Add(scale_name, *scale); } else { - scale_const_node = - graph->AddNode(y_name + "/scale_one", 1.0f, {x_inner_size}); + scale_node = graph->Add(y_name + "/scale_one", 1.0f, {x_inner_size}); } // Bias node - std::shared_ptr bias_const_node = nullptr; + std::shared_ptr bias_node = nullptr; if (HasInputArg(op_info, scope, "Bias")) { auto bias_name = op_info->Input("Bias").front(); auto bias_type = kernel->GetInputDeclType("Bias"); @@ -94,26 +93,25 @@ int LayerNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto bias_dims = bias->dims(); 
CHECK_EQ(bias_dims.size(), 1); CHECK_EQ(bias_dims.production(), x_inner_size); - bias_const_node = graph->AddNode(bias_name, *bias); + bias_node = graph->Add(bias_name, *bias); } else { - bias_const_node = - graph->AddNode(y_name + "/bias_zero", 0.0f, {x_inner_size}); + bias_node = graph->Add(y_name + "/bias_zero", 0.0f, {x_inner_size}); } // Layer Norm node auto layer_norm_node = - graph->AddNode(y_name, - graph->builder_.CreateLayerNorm(*x_node, - *scale_const_node, - *bias_const_node, - axis, - epsilon, - true, - true)); + graph->Add(y_name, + graph->builder_.CreateLayerNorm(*x_node->data(), + *scale_node->data(), + *bias_node->data(), + axis, + epsilon, + true, + true)); if (reshape) { - graph->AddNode(y_name, - graph->builder_.CreateReshape( - *layer_norm_node, CvtShape(y_dims))); + graph->Add(y_name, + graph->builder_.CreateReshape(*layer_norm_node->data(), + CvtShape(y_dims))); } return REBUILD_WHEN_SHAPE_CHANGED; } @@ -123,6 +121,6 @@ int LayerNormConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - layer_norm, +REGISTER_SUBGRAPH_BRIDGE(layer_norm, + kXPU, paddle::lite::subgraph::xpu::LayerNormConverter); diff --git a/lite/kernels/xpu/bridges/lookup_table_op.cc b/lite/kernels/xpu/bridges/lookup_table_op.cc old mode 100755 new mode 100644 index a03e0c2d24..eecf50b5bd --- a/lite/kernels/xpu/bridges/lookup_table_op.cc +++ b/lite/kernels/xpu/bridges/lookup_table_op.cc @@ -57,30 +57,37 @@ int LookupTableConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // Ids node - std::shared_ptr ids_node = nullptr; - if (graph->HasNode(ids_name)) { - ids_node = graph->GetNode(ids_name); + std::shared_ptr ids_node = nullptr; + if (graph->Has(ids_name)) { + ids_node = graph->Get(ids_name); } else { - ids_node = graph->AddNode( - ids_name, ids_dims, ids_type->precision(), ids_type->layout()); + ids_node = graph->Add(ids_name, *ids); } // Flatten Ids node if (ids_dims.size() != 1) { - ids_node = graph->AddNode(ids_name + "/reshape", - graph->builder_.CreateReshape(*ids_node, {-1}), - ids_type->precision(), - ids_type->layout()); + ids_node = + graph->Add(ids_name + "/reshape", + graph->builder_.CreateReshape(*ids_node->data(), {-1}), + ids_node->precision(), + ids_node->layout()); } - auto w_const_node = graph->AddNode(w_name, *w); + + // W node + auto w_node = graph->Add(w_name, *w); // Reshape the gather node with the inferred shape as the output node - auto gather_node = graph->AddNode( - out_name, - graph->builder_.CreateGather(*w_const_node, *ids_node, /* axis= */ 0)); + auto gather_node = + graph->Add(out_name, + graph->builder_.CreateGather( + *w_node->data(), *ids_node->data(), /* axis= */ 0), + w_node->precision(), + w_node->layout()); if (out_dims.size() != 2) { - graph->AddNode(out_name, - graph->builder_.CreateReshape( - *gather_node, CvtShape(out_dims))); + graph->Add(out_name, + graph->builder_.CreateReshape(*gather_node->data(), + CvtShape(out_dims)), + gather_node->precision(), + gather_node->layout()); } return SUCCESS; } @@ -90,6 +97,6 @@ int LookupTableConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - lookup_table, +REGISTER_SUBGRAPH_BRIDGE(lookup_table, + kXPU, paddle::lite::subgraph::xpu::LookupTableConverter); diff --git a/lite/kernels/xpu/bridges/matmul_op.cc b/lite/kernels/xpu/bridges/matmul_op.cc old mode 100755 new mode 100644 index 330b336840..c17ba8423c --- a/lite/kernels/xpu/bridges/matmul_op.cc +++ 
b/lite/kernels/xpu/bridges/matmul_op.cc @@ -57,19 +57,19 @@ int MatmulConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto alpha = op_info->GetAttr("alpha"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Y node - std::shared_ptr y_node = nullptr; - if (graph->HasNode(y_name)) { - y_node = graph->GetNode(y_name); + std::shared_ptr y_node = nullptr; + if (graph->Has(y_name)) { + y_node = graph->Get(y_name); } else { - y_node = graph->AddNode(y_name, y_dims); + y_node = graph->Add(y_name, *y); } // Matmul node @@ -80,52 +80,55 @@ int MatmulConverter(void* ctx, OpLite* op, KernelBase* kernel) { if (x_dims.size() != 3) { auto m = static_cast(x_dims[x_dims.size() - 2]); auto k = static_cast(x_dims[x_dims.size() - 1]); - x_node = - graph->AddNode(x_name + "/reshape", - graph->builder_.CreateReshape(*x_node, {-1, m, k})); + x_node = graph->Add( + x_name + "/reshape", + graph->builder_.CreateReshape(*x_node->data(), {-1, m, k})); if (transpose_x) { - x_node = - graph->AddNode(x_name + "/reshape/transpose", - graph->builder_.CreateTranspose(*x_node, {0, 2, 1})); + x_node = graph->Add( + x_name + "/reshape/transpose", + graph->builder_.CreateTranspose(*x_node->data(), {0, 2, 1})); } } // Reshape and transposed Y node if (y_dims.size() != 3) { auto k = static_cast(y_dims[y_dims.size() - 2]); auto n = static_cast(y_dims[y_dims.size() - 1]); - y_node = - graph->AddNode(y_name + "/reshape", - graph->builder_.CreateReshape(*y_node, {-1, k, n})); + y_node = graph->Add( + y_name + "/reshape", + graph->builder_.CreateReshape(*y_node->data(), {-1, k, n})); if (!transpose_y) { - y_node = - graph->AddNode(y_name + "/reshape/transpose", - graph->builder_.CreateTranspose(*y_node, {0, 2, 1})); + y_node = graph->Add( + y_name + "/reshape/transpose", + graph->builder_.CreateTranspose(*y_node->data(), {0, 2, 1})); } } // Matmul node - auto matmul_node = graph->AddNode( - out_name, graph->builder_.CreateBatchMatmul(*x_node, *y_node)); + auto matmul_node = graph->Add( + out_name, + graph->builder_.CreateBatchMatmul(*x_node->data(), *y_node->data())); if (fabs(alpha - 1) > 1e-6f) { - matmul_node = graph->AddNode( - out_name, graph->builder_.CreateScale(*matmul_node, alpha)); + matmul_node = graph->Add( + out_name, graph->builder_.CreateScale(*matmul_node->data(), alpha)); } if (out_dims.size() != 3) { - graph->AddNode(out_name, - graph->builder_.CreateReshape( - *matmul_node, CvtShape(out_dims))); + graph->Add(out_name, + graph->builder_.CreateReshape( + *matmul_node->data(), CvtShape(out_dims))); } } else if (x_dims.size() == 2 && y_dims.size() == 2) { // x: [M, K], y: [K, N], out: [M, N] if (transpose_x) { - x_node = graph->AddNode(x_name + "/transpose", - graph->builder_.CreateTranspose(*x_node, {1, 0})); + x_node = + graph->Add(x_name + "/transpose", + graph->builder_.CreateTranspose(*x_node->data(), {1, 0})); } - auto matmul_node = graph->AddNode( - out_name, - graph->builder_.CreateMatmul2D(*x_node, *y_node, transpose_y)); + auto matmul_node = + graph->Add(out_name, + graph->builder_.CreateMatmul2D( + *x_node->data(), *y_node->data(), transpose_y)); if (fabs(alpha - 1) > 1e-6f) { - matmul_node = graph->AddNode( - out_name, graph->builder_.CreateScale(*matmul_node, alpha)); + matmul_node = graph->Add( + out_name, 
graph->builder_.CreateScale(*matmul_node->data(), alpha)); } } else if (x_dims.size() == 1 && y_dims.size() == 1) { // x: [K], y: [K], out: [1] @@ -141,6 +144,6 @@ int MatmulConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - matmul, +REGISTER_SUBGRAPH_BRIDGE(matmul, + kXPU, paddle::lite::subgraph::xpu::MatmulConverter); diff --git a/lite/kernels/xpu/bridges/mul_op.cc b/lite/kernels/xpu/bridges/mul_op.cc index 4078055745..e12f767d13 100644 --- a/lite/kernels/xpu/bridges/mul_op.cc +++ b/lite/kernels/xpu/bridges/mul_op.cc @@ -56,49 +56,50 @@ int MulConverter(void* ctx, OpLite* op, KernelBase* kernel) { CHECK_EQ(x_matrix_dims[1], y_matrix_dims[0]); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Flatten X node if (x_dims.size() != 2) { - x_node = - graph->AddNode(x_name + "/reshape", - graph->builder_.CreateReshape( - *x_node, {-1, static_cast(x_matrix_dims[1])})); + x_node = graph->Add( + x_name + "/reshape", + graph->builder_.CreateReshape( + *x_node->data(), {-1, static_cast(x_matrix_dims[1])})); } // Y node - std::shared_ptr y_node = nullptr; - if (graph->HasNode(y_name)) { - y_node = graph->GetNode(y_name); + std::shared_ptr y_node = nullptr; + if (graph->Has(y_name)) { + y_node = graph->Get(y_name); } else { - y_node = graph->AddNode(y_name, y_dims); + y_node = graph->Add(y_name, *y); } // Flatten Y node if (y_dims.size() != 2) { - y_node = - graph->AddNode(y_name + "/reshape", - graph->builder_.CreateReshape( - *y_node, {static_cast(y_matrix_dims[0]), -1})); + y_node = graph->Add( + y_name + "/reshape", + graph->builder_.CreateReshape( + *y_node->data(), {static_cast(y_matrix_dims[0]), -1})); } // Reshape the matmul node with the inferred shape as the output node - auto matmul_node = graph->AddNode( - out_name, graph->builder_.CreateMatmul2D(*x_node, *y_node, false)); + auto matmul_node = graph->Add( + out_name, + graph->builder_.CreateMatmul2D(*x_node->data(), *y_node->data(), false)); if (out_dims.size() != 2) { - graph->AddNode(out_name, - graph->builder_.CreateReshape( - *matmul_node, CvtShape(out_dims))); + graph->Add(out_name, + graph->builder_.CreateReshape( + *matmul_node->data(), CvtShape(out_dims))); } return REBUILD_WHEN_SHAPE_CHANGED; -} +} // namespace xpu } // namespace xpu } // namespace subgraph } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, mul, paddle::lite::subgraph::xpu::MulConverter); +REGISTER_SUBGRAPH_BRIDGE(mul, kXPU, paddle::lite::subgraph::xpu::MulConverter); diff --git a/lite/kernels/xpu/bridges/paddle_use_bridges.h b/lite/kernels/xpu/bridges/paddle_use_bridges.h old mode 100755 new mode 100644 index 588fcdd6e4..bed88034ae --- a/lite/kernels/xpu/bridges/paddle_use_bridges.h +++ b/lite/kernels/xpu/bridges/paddle_use_bridges.h @@ -14,25 +14,25 @@ #pragma once -USE_SUBGRAPH_BRIDGE(XPU, relu); -USE_SUBGRAPH_BRIDGE(XPU, tanh); -USE_SUBGRAPH_BRIDGE(XPU, conv2d); -USE_SUBGRAPH_BRIDGE(XPU, depthwise_conv2d); -USE_SUBGRAPH_BRIDGE(XPU, elementwise_add); -USE_SUBGRAPH_BRIDGE(XPU, pool2d); -USE_SUBGRAPH_BRIDGE(XPU, softmax); -USE_SUBGRAPH_BRIDGE(XPU, mul); -USE_SUBGRAPH_BRIDGE(XPU, batch_norm); -USE_SUBGRAPH_BRIDGE(XPU, stack); -USE_SUBGRAPH_BRIDGE(XPU, gather); -USE_SUBGRAPH_BRIDGE(XPU, scale); -USE_SUBGRAPH_BRIDGE(XPU, 
lookup_table); -USE_SUBGRAPH_BRIDGE(XPU, slice); -USE_SUBGRAPH_BRIDGE(XPU, transpose); -USE_SUBGRAPH_BRIDGE(XPU, transpose2); -USE_SUBGRAPH_BRIDGE(XPU, reshape); -USE_SUBGRAPH_BRIDGE(XPU, reshape2); -USE_SUBGRAPH_BRIDGE(XPU, layer_norm); -USE_SUBGRAPH_BRIDGE(XPU, gelu); -USE_SUBGRAPH_BRIDGE(XPU, dropout); -USE_SUBGRAPH_BRIDGE(XPU, matmul); +USE_SUBGRAPH_BRIDGE(relu, kXPU); +USE_SUBGRAPH_BRIDGE(tanh, kXPU); +USE_SUBGRAPH_BRIDGE(conv2d, kXPU); +USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kXPU); +USE_SUBGRAPH_BRIDGE(elementwise_add, kXPU); +USE_SUBGRAPH_BRIDGE(pool2d, kXPU); +USE_SUBGRAPH_BRIDGE(softmax, kXPU); +USE_SUBGRAPH_BRIDGE(mul, kXPU); +USE_SUBGRAPH_BRIDGE(batch_norm, kXPU); +USE_SUBGRAPH_BRIDGE(stack, kXPU); +USE_SUBGRAPH_BRIDGE(gather, kXPU); +USE_SUBGRAPH_BRIDGE(scale, kXPU); +USE_SUBGRAPH_BRIDGE(lookup_table, kXPU); +USE_SUBGRAPH_BRIDGE(slice, kXPU); +USE_SUBGRAPH_BRIDGE(transpose, kXPU); +USE_SUBGRAPH_BRIDGE(transpose2, kXPU); +USE_SUBGRAPH_BRIDGE(reshape, kXPU); +USE_SUBGRAPH_BRIDGE(reshape2, kXPU); +USE_SUBGRAPH_BRIDGE(layer_norm, kXPU); +USE_SUBGRAPH_BRIDGE(gelu, kXPU); +USE_SUBGRAPH_BRIDGE(dropout, kXPU); +USE_SUBGRAPH_BRIDGE(matmul, kXPU); diff --git a/lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h b/lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h deleted file mode 100644 index 3c76e0e8b5..0000000000 --- a/lite/kernels/xpu/bridges/paddle_use_xpu_bridges.h +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "lite/kernels/xpu/bridges/registry.h" - -USE_XPU_BRIDGE(relu); -USE_XPU_BRIDGE(conv2d); -USE_XPU_BRIDGE(depthwise_conv2d); -USE_XPU_BRIDGE(elementwise_add); -USE_XPU_BRIDGE(pool2d); -USE_XPU_BRIDGE(softmax); -USE_XPU_BRIDGE(mul); -USE_XPU_BRIDGE(batch_norm); diff --git a/lite/kernels/xpu/bridges/pool_op.cc b/lite/kernels/xpu/bridges/pool_op.cc index 60787a3429..90653edcce 100644 --- a/lite/kernels/xpu/bridges/pool_op.cc +++ b/lite/kernels/xpu/bridges/pool_op.cc @@ -50,21 +50,22 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto exclusive = op_info->GetAttr("exclusive"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Pool node if (pooling_type == "max") { if (global_pooling) { - graph->AddNode(out_name, graph->builder_.CreateGlobalMaxPool2D(*x_node)); + graph->Add(out_name, + graph->builder_.CreateGlobalMaxPool2D(*x_node->data())); } else { - graph->AddNode( + graph->Add( out_name, - graph->builder_.CreateMaxPool2D(*x_node, + graph->builder_.CreateMaxPool2D(*x_node->data(), CvtShape(ksize), CvtShape(strides), CvtShape(paddings), @@ -73,12 +74,13 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { } } else if (pooling_type == "avg") { if (global_pooling) { - graph->AddNode(out_name, graph->builder_.CreateGlobalAvgPool2D(*x_node)); + graph->Add(out_name, + graph->builder_.CreateGlobalAvgPool2D(*x_node->data())); } else { // !exclusive ---> count_include_pad - graph->AddNode( + graph->Add( out_name, - graph->builder_.CreateAvgPool2D(*x_node, + graph->builder_.CreateAvgPool2D(*x_node->data(), CvtShape(ksize), CvtShape(strides), CvtShape(paddings), @@ -98,6 +100,6 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - pool2d, +REGISTER_SUBGRAPH_BRIDGE(pool2d, + kXPU, paddle::lite::subgraph::xpu::PoolConverter); diff --git a/lite/kernels/xpu/bridges/registry.cc b/lite/kernels/xpu/bridges/registry.cc deleted file mode 100644 index 4ab1b69a25..0000000000 --- a/lite/kernels/xpu/bridges/registry.cc +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/kernels/xpu/bridges/registry.h" -#include - -namespace paddle { -namespace lite { -namespace kernels { -namespace xpu { -namespace bridges { - -Factory& Factory::Instance() { - static Factory g_xpu_bridge; - return g_xpu_bridge; -} - -bool Factory::HasType(const std::string& op_type) const { - return map_.count(op_type); -} - -void Factory::Insert(const std::string& op_type, const func_type& func_name) { - map_.insert(std::make_pair(op_type, func_name)); -} - -} // namespace bridges -} // namespace xpu -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/xpu/bridges/registry.h b/lite/kernels/xpu/bridges/registry.h deleted file mode 100644 index c990399c1c..0000000000 --- a/lite/kernels/xpu/bridges/registry.h +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include "lite/core/op_lite.h" -#include "lite/utils/macros.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace xpu { -namespace bridges { - -// xpu network builder and constant tensors -class graph_ctx_type { - public: - std::shared_ptr builder; - std::shared_ptr params; -}; - -// var_name, xpu node pointer -using node_map_type = - std::unordered_map>; - -using func_type = std::function, graph_ctx_type*, const node_map_type&)>; -using cvt_map_type = std::unordered_map; -class Factory { - public: - static Factory& Instance(); - - const cvt_map_type& AllFunctions() const { return map_; } - bool HasType(const std::string& op_type) const; - void Insert(const std::string& op_type, const func_type& func_name); - Factory() = default; - - private: - cvt_map_type map_; - DISALLOW_COPY_AND_ASSIGN(Factory); -}; - -} // namespace bridges -} // namespace xpu -} // namespace kernels -} // namespace lite -} // namespace paddle - -// some platform-independent defintion -#if defined(_WIN32) -#define UNUSED -#define __builtin_expect(EXP, C) (EXP) -#else -#define UNUSED __attribute__((unused)) -#endif - -#define STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE(uniq_name, msg) \ - struct __test_global_namespace_##uniq_name##__ {}; \ - static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ - __test_global_namespace_##uniq_name##__>::value, \ - msg) - -#define REGISTER_XPU_BRIDGE(op_type, cvt_func_name) \ - STATIC_ASSERT_JITKERNEL_GLOBAL_NAMESPACE( \ - __reg_xpu_bridge_##op_type##__, \ - "REGISTER_XPU_BRIDGE must be called in global namespace only once!"); \ - int __reg_xpu_bridge_##op_type##_Insert() { \ - paddle::lite::kernels::xpu::bridges::Factory::Instance().Insert( \ - #op_type, cvt_func_name); \ - return 0; \ - } - -#define USE_XPU_BRIDGE(op_type) \ - extern int __reg_xpu_bridge_##op_type##_Insert(); \ - static int __reg_xpu_bridge_##op_type##_Insert_return UNUSED = \ - __reg_xpu_bridge_##op_type##_Insert(); diff --git a/lite/kernels/xpu/bridges/reshape_op.cc 
b/lite/kernels/xpu/bridges/reshape_op.cc old mode 100755 new mode 100644 index eeee6c7244..5e9a37d18e --- a/lite/kernels/xpu/bridges/reshape_op.cc +++ b/lite/kernels/xpu/bridges/reshape_op.cc @@ -33,22 +33,16 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { // Get input and output vars and op attributes auto x_name = op_info->Input("X").front(); - auto x_type = kernel->GetInputDeclType("X"); - CHECK(x_type->precision() == PRECISION(kFloat)); - CHECK(x_type->layout() == DATALAYOUT(kNCHW)); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); auto out_name = op_info->Output("Out").front(); - auto out_type = kernel->GetOutputDeclType("Out"); - CHECK(out_type->precision() == PRECISION(kFloat)); - CHECK(out_type->layout() == DATALAYOUT(kNCHW)); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } std::vector shape; @@ -59,6 +53,7 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { // CHECK(shape_tensor_type->layout() == DATALAYOUT(kNCHW)); for (auto shape_tensor_name : shape_tensor_names) { auto shape_tensor = scope->FindMutableTensor(shape_tensor_name); + CHECK(shape_tensor->persistable()); auto shape_tensor_data = shape_tensor->mutable_data(); shape.emplace_back(shape_tensor_data[0]); } @@ -73,6 +68,7 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { // CHECK(actual_shape_type->precision() == PRECISION(kInt32)); // CHECK(actual_shape_type->layout() == DATALAYOUT(kNCHW)); auto actual_shape = scope->FindMutableTensor(actual_shape_name); + CHECK(actual_shape->persistable()); auto actual_shape_dims = actual_shape->dims(); auto actual_shape_data = actual_shape->mutable_data(); auto shape = std::vector( @@ -86,9 +82,11 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto out_dims = operators::ValidateShape(shape, x_dims); // Reshape node - graph->AddNode(out_name, - graph->builder_.CreateReshape( - *x_node, CvtShape(out_dims))); + graph->Add(out_name, + graph->builder_.CreateReshape(*x_node->data(), + CvtShape(out_dims)), + x_node->precision(), + x_node->layout()); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -97,9 +95,9 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - reshape2, +REGISTER_SUBGRAPH_BRIDGE(reshape2, + kXPU, paddle::lite::subgraph::xpu::ReshapeConverter); -REGISTER_SUBGRAPH_BRIDGE(XPU, - reshape, +REGISTER_SUBGRAPH_BRIDGE(reshape, + kXPU, paddle::lite::subgraph::xpu::ReshapeConverter); diff --git a/lite/kernels/xpu/bridges/scale_op.cc b/lite/kernels/xpu/bridges/scale_op.cc old mode 100755 new mode 100644 index a3423d290c..e6871390ac --- a/lite/kernels/xpu/bridges/scale_op.cc +++ b/lite/kernels/xpu/bridges/scale_op.cc @@ -46,17 +46,17 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { float bias = op_info->GetAttr("bias"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Scale node - graph->AddNode( - out_name, - graph->builder_.CreateScale(*x_node, scale, bias, bias_after_scale)); + graph->Add(out_name, + 
graph->builder_.CreateScale( + *x_node->data(), scale, bias, bias_after_scale)); return SUCCESS; } @@ -65,6 +65,6 @@ int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - scale, +REGISTER_SUBGRAPH_BRIDGE(scale, + kXPU, paddle::lite::subgraph::xpu::ScaleConverter); diff --git a/lite/kernels/xpu/bridges/slice_op.cc b/lite/kernels/xpu/bridges/slice_op.cc old mode 100755 new mode 100644 index 90c91d3b59..3e4592d454 --- a/lite/kernels/xpu/bridges/slice_op.cc +++ b/lite/kernels/xpu/bridges/slice_op.cc @@ -46,11 +46,11 @@ int SliceConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto ends = op_info->GetAttr>("ends"); // Input node - std::shared_ptr input_node = nullptr; - if (graph->HasNode(input_name)) { - input_node = graph->GetNode(input_name); + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + input_node = graph->Get(input_name); } else { - input_node = graph->AddNode(input_name, input_dims); + input_node = graph->Add(input_name, *input); } // Calculate the begin and end of the slice in all of @@ -74,9 +74,9 @@ int SliceConverter(void* ctx, OpLite* op, KernelBase* kernel) { strides.push_back(1); } } - graph->AddNode( - out_name, - graph->builder_.CreateStridedSlice(*input_node, begin, end, strides)); + graph->Add(out_name, + graph->builder_.CreateStridedSlice( + *input_node->data(), begin, end, strides)); return REBUILD_WHEN_SHAPE_CHANGED; } @@ -85,6 +85,6 @@ int SliceConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - slice, +REGISTER_SUBGRAPH_BRIDGE(slice, + kXPU, paddle::lite::subgraph::xpu::SliceConverter); diff --git a/lite/kernels/xpu/bridges/softmax_op.cc b/lite/kernels/xpu/bridges/softmax_op.cc index 6deb536ef1..d964f29a86 100644 --- a/lite/kernels/xpu/bridges/softmax_op.cc +++ b/lite/kernels/xpu/bridges/softmax_op.cc @@ -44,15 +44,15 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto axis = op_info->GetAttr("axis"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Softmax node - graph->AddNode(out_name, graph->builder_.CreateSoftmax(*x_node, axis)); + graph->Add(out_name, graph->builder_.CreateSoftmax(*x_node->data(), axis)); return SUCCESS; } @@ -61,6 +61,6 @@ int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - softmax, +REGISTER_SUBGRAPH_BRIDGE(softmax, + kXPU, paddle::lite::subgraph::xpu::SoftmaxConverter); diff --git a/lite/kernels/xpu/bridges/stack_op.cc b/lite/kernels/xpu/bridges/stack_op.cc old mode 100755 new mode 100644 index eb7d6d7b79..69673aaeba --- a/lite/kernels/xpu/bridges/stack_op.cc +++ b/lite/kernels/xpu/bridges/stack_op.cc @@ -46,19 +46,19 @@ int StackConverter(void* ctx, OpLite* op, KernelBase* kernel) { for (auto& x_name : x_names) { auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } - x_nodes.push_back(*x_node); + 
x_nodes.push_back(*x_node->data()); } // Stack node - graph->AddNode(y_name, - graph->builder_.CreateStack( - xtcl::network::TupleNode::make(x_nodes), axis)); + graph->Add(y_name, + graph->builder_.CreateStack( + xtcl::network::TupleNode::make(x_nodes), axis)); return SUCCESS; } @@ -67,6 +67,6 @@ int StackConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - stack, +REGISTER_SUBGRAPH_BRIDGE(stack, + kXPU, paddle::lite::subgraph::xpu::StackConverter); diff --git a/lite/kernels/xpu/bridges/transpose_op.cc b/lite/kernels/xpu/bridges/transpose_op.cc old mode 100755 new mode 100644 index b6823dd6a8..4217fe0119 --- a/lite/kernels/xpu/bridges/transpose_op.cc +++ b/lite/kernels/xpu/bridges/transpose_op.cc @@ -44,19 +44,19 @@ int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { auto axis = op_info->GetAttr>("axis"); // X node - std::shared_ptr x_node = nullptr; - if (graph->HasNode(x_name)) { - x_node = graph->GetNode(x_name); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); } else { - x_node = graph->AddNode(x_name, x_dims); + x_node = graph->Add(x_name, *x); } // Transpose node - graph->AddNode(out_name, - graph->builder_.CreateTranspose( - *x_node, - CvtShape( - std::vector(axis.begin(), axis.end())))); + graph->Add(out_name, + graph->builder_.CreateTranspose( + *x_node->data(), + CvtShape( + std::vector(axis.begin(), axis.end())))); return SUCCESS; } @@ -66,9 +66,9 @@ int TransposeConverter(void* ctx, OpLite* op, KernelBase* kernel) { } // namespace lite } // namespace paddle -REGISTER_SUBGRAPH_BRIDGE(XPU, - transpose, +REGISTER_SUBGRAPH_BRIDGE(transpose, + kXPU, paddle::lite::subgraph::xpu::TransposeConverter); -REGISTER_SUBGRAPH_BRIDGE(XPU, - transpose2, +REGISTER_SUBGRAPH_BRIDGE(transpose2, + kXPU, paddle::lite::subgraph::xpu::TransposeConverter); diff --git a/lite/kernels/xpu/bridges/utility.cc b/lite/kernels/xpu/bridges/utility.cc old mode 100755 new mode 100644 index 79fad7c8b4..ce28f38019 --- a/lite/kernels/xpu/bridges/utility.cc +++ b/lite/kernels/xpu/bridges/utility.cc @@ -103,7 +103,7 @@ DLDeviceType CvtDLDeviceType(TargetType in_type) { out_type = kDLGPU; break; case TARGET(kXPU): - out_type = kDLCPU; + out_type = static_cast(kDLXPU); break; default: LOG(FATAL) << "[XPU] Can not convert target type(" << TargetToStr(in_type) @@ -115,8 +115,8 @@ DLDeviceType CvtDLDeviceType(TargetType in_type) { std::shared_ptr CvtTensor(const Tensor& in_tensor, std::vector out_shape, - PrecisionType in_precision, DataLayoutType in_layout) { + PrecisionType in_precision = in_tensor.precision(); auto in_shape = in_tensor.dims().Vectorize(); if (out_shape.empty()) { out_shape = in_shape; diff --git a/lite/kernels/xpu/bridges/utility.h b/lite/kernels/xpu/bridges/utility.h old mode 100755 new mode 100644 index a02a5ddff0..7769558545 --- a/lite/kernels/xpu/bridges/utility.h +++ b/lite/kernels/xpu/bridges/utility.h @@ -58,7 +58,6 @@ xtcl::Array CvtShape(const DDim& in_dims) { std::shared_ptr CvtTensor( const Tensor& in_tensor, std::vector out_shape = {}, - PrecisionType in_precision = PRECISION(kFloat), DataLayoutType in_layout = DATALAYOUT(kNCHW)); } // namespace xpu diff --git a/lite/kernels/xpu/graph_compute.cc b/lite/kernels/xpu/graph_compute.cc deleted file mode 100644 index b9e5be1a1d..0000000000 --- a/lite/kernels/xpu/graph_compute.cc +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/xpu/graph_compute.h" -#include -#include -#include -#include -#include "lite/backends/xpu/runtime.h" -#include "lite/core/op_registry.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace xpu { - -void GraphCompute::PrepareForRun() { - // auto& ctx = this->ctx_->template As(); - auto& param = this->Param(); - CHECK(param.weight); - CHECK(lite::xpu::LoadModel(*param.weight, &runtime_)); - CHECK(runtime_ != nullptr); -} - -void GraphCompute::Run() { - auto& param = this->Param(); - auto GetCurrentUS = []() -> double { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; - }; - auto start_time = GetCurrentUS(); - for (int i = 0; i < param.inputs.size(); i++) { - auto input_var_name = param.inputs[i].first; - auto input_tensor = param.inputs[i].second; - LOG(INFO) << "input dims[" << i << ":" << input_var_name - << "]: " << input_tensor->dims(); - auto input_tensor_data = input_tensor->data(); - for (int j = 0; j < input_tensor->dims().production(); j++) { - VLOG(3) << input_tensor_data[j]; - } - auto input_ndarray = xtcl::xNDArray::Empty( - input_tensor->dims().Vectorize(), {kDLFloat, 32, 1}, {kDLCPU, 0}); - auto input_ndarray_data = - static_cast(input_ndarray.ToDLPack()->dl_tensor.data); - std::memcpy(input_ndarray_data, - input_tensor_data, - sizeof(float) * input_tensor->dims().production()); - runtime_->SetInputZeroCopy(input_var_name, - &input_ndarray.ToDLPack()->dl_tensor); - } - runtime_->Run(); - for (int i = 0; i < param.outputs.size(); i++) { - auto output_ndarray = runtime_->GetOutput(i); - auto output_var_name = param.outputs[i].first; - auto output_tensor = param.outputs[i].second; - output_tensor->Resize(output_ndarray.Shape()); - LOG(INFO) << "output dims[" << i << ":" << output_var_name - << "]: " << output_tensor->dims(); - auto output_ndarray_data = - static_cast(output_ndarray.ToDLPack()->dl_tensor.data); - auto output_tensor_data = output_tensor->mutable_data(); - std::memcpy(output_tensor_data, - output_ndarray_data, - sizeof(float) * output_tensor->dims().production()); - for (int j = 0; j < output_tensor->dims().production(); j++) { - VLOG(3) << output_tensor_data[j]; - } - } - LOG(INFO) << "[XPU] Process cost " << GetCurrentUS() - start_time << " us"; -} - -} // namespace xpu -} // namespace kernels -} // namespace lite -} // namespace paddle - -REGISTER_LITE_KERNEL(graph_op, - kXPU, - kFloat, - kNCHW, - paddle::lite::kernels::xpu::GraphCompute, - def) - .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindInput("Weight", {LiteType::GetTensorTy(TARGET(kHost))}) - .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kHost))}) - .Finalize(); diff --git a/lite/kernels/xpu/graph_compute.h b/lite/kernels/xpu/graph_compute.h deleted file mode 100644 index 5406daa8a1..0000000000 --- a/lite/kernels/xpu/graph_compute.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2019 
PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include "lite/core/kernel.h" -#include "lite/core/op_registry.h" -#include "lite/core/types.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace xpu { - -class GraphCompute : public KernelLite { - public: - using param_t = operators::GraphParam; - - void PrepareForRun() override; - - void Run() override; - - virtual ~GraphCompute() = default; - - private: - std::shared_ptr runtime_{nullptr}; -}; - -} // namespace xpu -} // namespace kernels -} // namespace lite -} // namespace paddle diff --git a/lite/kernels/xpu/subgraph_compute.cc b/lite/kernels/xpu/subgraph_compute.cc old mode 100755 new mode 100644 index 07a74b0454..15df4f80ca --- a/lite/kernels/xpu/subgraph_compute.cc +++ b/lite/kernels/xpu/subgraph_compute.cc @@ -39,13 +39,13 @@ int SubgraphEngine::BuildDeviceProgram() { op->CheckShape(); op->InferShape(); std::string op_type = op->op_info()->Type(); - if (!bridges.Exists("XPU", op_type)) { + if (!bridges.Exists(op_type, "kXPU")) { return subgraph::FAILED; } auto kernel = inst.kernel(); - status |= bridges.Select("XPU", op_type)(reinterpret_cast(&graph), - const_cast(op), - const_cast(kernel)); + status |= bridges.Select(op_type, "kXPU")(reinterpret_cast(&graph), + const_cast(op), + const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { return subgraph::FAILED; } @@ -57,26 +57,26 @@ int SubgraphEngine::BuildDeviceProgram() { std::vector device_inodes; std::vector device_onodes; for (auto& input_name : input_names_) { - if (graph.HasNode(input_name)) { - if (!graph.GetType(input_name).persistable()) { - device_inodes.push_back(graph.GetNode(input_name).get()); + if (graph.Has(input_name)) { + if (graph.Get(input_name)->is_data()) { + device_inodes.push_back(graph.Get(input_name)->data().get()); device_inames_.push_back(input_name); } else { LOG(WARNING) << "[XPU] Input node " << input_name - << " is skipped because it is a persistable node."; + << " is ignored because it is not a data node."; } } else { LOG(WARNING) << "[XPU] Input node " << input_name - << " is skipped because it does not exist."; + << " is ignored because it does not exist."; } } for (auto& output_name : output_names_) { - if (graph.HasNode(output_name)) { - device_onodes.push_back(graph.GetNode(output_name).get()); + if (graph.Has(output_name)) { + device_onodes.push_back(graph.Get(output_name)->data().get()); device_onames_.push_back(output_name); } else { LOG(WARNING) << "[XPU] Output node " << output_name - << " is skipped because it does not exist."; + << " is ignored because it does not exist."; } } CHECK(!device_inames_.empty()) @@ -98,14 +98,14 @@ int SubgraphEngine::BuildDeviceProgram() { origin_otensors_.resize(device_onames_.size()); device_otensors_.resize(device_onames_.size()); for (int i = 0; i < device_inames_.size(); i++) { - auto type = graph.GetType(device_inames_[i]); - auto precision = type.precision(); - auto 
layout = type.layout(); + auto node = graph.Get(device_inames_[i]); + auto precision = node->precision(); + auto layout = node->layout(); origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]); CHECK(origin_itensors_[i]); origin_idims_[i] = origin_itensors_[i]->dims(); - VLOG(3) << "[XPU] Inputs[" << i - << "] precision: " << PrecisionToStr(precision) + VLOG(3) << "[XPU] Inputs[" << i << "] name: " << device_inames_[i] + << " precision: " << PrecisionToStr(precision) << " layout: " << DataLayoutToStr(layout) << " dims: " << origin_idims_[i]; // Prepare the device input tensors which share data with the origin input @@ -122,14 +122,14 @@ int SubgraphEngine::BuildDeviceProgram() { device_itensors_[i].byte_offset = 0; } for (int i = 0; i < device_onames_.size(); i++) { - auto type = graph.GetType(device_onames_[i]); - auto precision = type.precision(); - auto layout = type.layout(); + auto node = graph.Get(device_onames_[i]); + auto precision = node->precision(); + auto layout = node->layout(); origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); CHECK(origin_otensors_[i]); origin_odims_[i] = origin_otensors_[i]->dims(); - VLOG(3) << "[XPU] Outputs[" << i - << "] precision: " << PrecisionToStr(precision) + VLOG(3) << "[XPU] Outputs[" << i << "] name: " << device_onames_[i] + << " precision: " << PrecisionToStr(precision) << " layout: " << DataLayoutToStr(layout) << " dims: " << origin_odims_[i]; // Prepare the device output tensors which share data with the origin output @@ -175,7 +175,7 @@ int SubgraphEngine::LaunchDeviceProgram() { // Update the data pointer of DLTensor to track the origin input tensors device_itensors_[i].data = const_cast(origin_itensors_[i]->raw_data()); - device_program_->SetInputZeroCopy(device_inames_[i], &device_itensors_[i]); + device_program_->SetInput(device_inames_[i], &device_itensors_[i]); } // Run the XPU model auto GetCurrentUS = []() -> double { diff --git a/lite/kernels/xpu/subgraph_compute.h b/lite/kernels/xpu/subgraph_compute.h old mode 100755 new mode 100644 diff --git a/lite/model_parser/naive_buffer/naive_buffer.h b/lite/model_parser/naive_buffer/naive_buffer.h index 717dd3c5a6..9be2be9543 100644 --- a/lite/model_parser/naive_buffer/naive_buffer.h +++ b/lite/model_parser/naive_buffer/naive_buffer.h @@ -128,19 +128,23 @@ using Float64Builder = PrimaryBuilder; template class PrimaryListBuilder : public FieldBuilder { - std::vector data_; + const Primary* data_{nullptr}; + int size_{0}; public: using value_type = Primary; explicit PrimaryListBuilder(BinaryTable* table) : FieldBuilder(table) {} - PrimaryListBuilder(BinaryTable* table, const std::vector& val) - : FieldBuilder(table), data_(val) {} + PrimaryListBuilder(BinaryTable* table, const Primary* val, int size) + : FieldBuilder(table), data_(val), size_(size) {} /// Set data. - void set(const std::vector& x) { data_ = x; } + void set(const Primary* x, int size) { + data_ = x; + size_ = size; + } - const std::vector& data() const { return data_; } + const Primary* data() const { return data_; } /// Save information to the corresponding BinaryTable. void Save() override; @@ -149,14 +153,12 @@ class PrimaryListBuilder : public FieldBuilder { void Load() override; /// Number of elements. 
- size_t size() const { return data_.size(); } + size_t size() const { return size_; } - Type type() const override { - return core::StdTypeToRepr>(); - } + Type type() const override { return core::StdTypeToRepr(); } /// clear builder - void Clear() { data_.clear(); } + void Clear() { size_ = 0; } ~PrimaryListBuilder() = default; }; @@ -381,17 +383,14 @@ void PrimaryBuilder::Load() { template void PrimaryListBuilder::Load() { - CHECK(data_.empty()) << "Duplicate load"; + CHECK(data_ == nullptr) << "Duplicate load"; // Load number of elements first. uint64_t num_elems{}; memcpy(&num_elems, table()->cursor(), sizeof(uint64_t)); table()->Consume(sizeof(uint64_t)); - data_.resize(num_elems); - for (uint64_t i = 0; i < num_elems; i++) { - memcpy(&data_[i], table()->cursor(), sizeof(value_type)); - table()->Consume(sizeof(value_type)); - } + set(reinterpret_cast(table()->cursor()), num_elems); + table()->Consume(num_elems * sizeof(value_type)); } template @@ -404,7 +403,7 @@ void PrimaryListBuilder::Save() { table()->Require(num_elems * sizeof(value_type)); memcpy(table()->cursor(), - reinterpret_cast(&data_[0]), + reinterpret_cast(data_), num_elems * sizeof(value_type)); table()->Consume(num_elems * sizeof(value_type)); } diff --git a/lite/model_parser/naive_buffer/param_desc.cc b/lite/model_parser/naive_buffer/param_desc.cc index 4397b3c413..cc97b02716 100644 --- a/lite/model_parser/naive_buffer/param_desc.cc +++ b/lite/model_parser/naive_buffer/param_desc.cc @@ -150,9 +150,9 @@ void ParamDesc::SetDim(const std::vector& dim) { << "Data Type mismatch"; \ std::vector res; \ auto& data_builder = desc_->GetField>("data"); \ - auto& data = data_builder.data(); \ - size_t size = data.size() / sizeof(T); \ - auto* data_ptr = reinterpret_cast(&data[0]); \ + auto data = data_builder.data(); \ + size_t size = data_builder.size() / sizeof(T); \ + auto* data_ptr = reinterpret_cast(data); \ for (size_t i = 0; i < size; ++i) { \ res.push_back(data_ptr[i]); \ } \ @@ -178,8 +178,7 @@ GET_DATA_IMPL(double, FP64); data_builder->Clear(); \ size_t size = size__ * sizeof(T); \ auto* data_ptr = reinterpret_cast(data_ptr__); \ - std::vector data_vec(data_ptr, data_ptr + size); \ - data_builder->set(data_vec); + data_builder->set(data_ptr, size); #define SET_DATA_IMPL(T, type__) \ template <> \ diff --git a/lite/operators/CMakeLists.txt b/lite/operators/CMakeLists.txt index 190cf7194c..f307cb66ac 100644 --- a/lite/operators/CMakeLists.txt +++ b/lite/operators/CMakeLists.txt @@ -50,6 +50,7 @@ add_operator(layout_op basic SRCS layout_op.cc DEPS ${op_DEPS}) add_operator(instance_norm_op basic SRCS instance_norm_op.cc DEPS ${op_DEPS}) add_operator(subgraph_op basic SRCS subgraph_op.cc DEPS ${op_DEPS}) add_operator(grid_sampler_op basic SRCS grid_sampler_op.cc DEPS ${op_DEPS}) +add_operator(flatten_op basic SRCS flatten_op.cc DEPS ${op_DEPS}) # 2.basic ops not used in basic models add_operator(negative_op extra SRCS negative_op.cc DEPS ${op_DEPS}) @@ -78,11 +79,9 @@ add_operator(anchor_generator_op extra SRCS anchor_generator_op.cc DEPS ${op_DEP add_operator(generate_proposals_op extra SRCS generate_proposals_op.cc DEPS ${op_DEPS}) add_operator(roi_align_op extra SRCS roi_align_op.cc DEPS ${op_DEPS}) add_operator(box_clip_op extra SRCS box_clip_op.cc DEPS ${op_DEPS}) -add_operator(flatten_op extra SRCS flatten_op.cc DEPS ${op_DEPS}) add_operator(fake_quantize_range_abs_max_op extra SRCS fake_quantize_range_abs_max.cc DEPS ${op_DEPS}) add_operator(sequence_expand_as_op_lite extra SRCS sequence_expand_as_op.cc DEPS 
${op_DEPS}) add_operator(assign_value_op extra SRCS assign_value_op.cc DEPS ${op_DEPS}) - add_operator(fake_quantize_dequantize_moving_avg_abs_max_op extra SRCS fake_quantize_dequantize_moving_avg_max_abs.cc DEPS ${op_DEPS}) add_operator(fake_channel_wise_dequantize_max_abs_op extra SRCS fake_channel_wise_dequantize_max_abs.cc DEPS ${op_DEPS}) add_operator(split_lod_tensor_op_lite extra SRCS split_lod_tensor_op.cc DEPS ${op_DEPS}) diff --git a/lite/operators/collect_fpn_proposals_op.cc b/lite/operators/collect_fpn_proposals_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/collect_fpn_proposals_op.h b/lite/operators/collect_fpn_proposals_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/compare_op.cc b/lite/operators/compare_op.cc index 3210520cd5..aa500ba35c 100644 --- a/lite/operators/compare_op.cc +++ b/lite/operators/compare_op.cc @@ -54,7 +54,7 @@ bool CompareOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) { } // namespace paddle REGISTER_LITE_OP(equal, paddle::lite::operators::CompareOp); -REGISTER_LITE_OP(notequal, paddle::lite::operators::CompareOp); +REGISTER_LITE_OP(not_equal, paddle::lite::operators::CompareOp); REGISTER_LITE_OP(less_than, paddle::lite::operators::CompareOp); REGISTER_LITE_OP(less_equal, paddle::lite::operators::CompareOp); REGISTER_LITE_OP(greater_than, paddle::lite::operators::CompareOp); diff --git a/lite/operators/conditional_block_op.cc b/lite/operators/conditional_block_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/conditional_block_op.h b/lite/operators/conditional_block_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/distribute_fpn_proposals_op.cc b/lite/operators/distribute_fpn_proposals_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/distribute_fpn_proposals_op.h b/lite/operators/distribute_fpn_proposals_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/dropout_op.cc b/lite/operators/dropout_op.cc index bef0891847..03047de3b3 100644 --- a/lite/operators/dropout_op.cc +++ b/lite/operators/dropout_op.cc @@ -33,7 +33,7 @@ bool DropoutOp::InferShape() const { param_.mask->Resize(x_dims); } // share LoD - // param_.output->set_lod(param_.input->lod()); + param_.output->set_lod(param_.x->lod()); return true; } diff --git a/lite/operators/fc_op.cc b/lite/operators/fc_op.cc index ad3fcf79a3..702950ae18 100644 --- a/lite/operators/fc_op.cc +++ b/lite/operators/fc_op.cc @@ -61,7 +61,7 @@ bool FcOpLite::InferShape() const { param_.output->Resize(lite::DDim(output_dims)); // share LoD - // param_.output->set_lod(param_.input->lod()); + param_.output->set_lod(param_.input->lod()); return true; } diff --git a/lite/operators/graph_op.cc b/lite/operators/graph_op.cc deleted file mode 100644 index 018ce264e2..0000000000 --- a/lite/operators/graph_op.cc +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/operators/graph_op.h" -#include -#include "lite/core/op_registry.h" - -namespace paddle { -namespace lite { -namespace operators { - -bool GraphOpLite::CheckShape() const { - CHECK_GE_OR_FALSE(param_.inputs.size(), 1UL); - CHECK_GE_OR_FALSE(param_.outputs.size(), 1UL); - return true; -} - -bool GraphOpLite::InferShape() const { return CheckShape(); /* enrich me */ } - -bool GraphOpLite::AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) { - auto inputs = op_desc.Input("Inputs"); - auto weight = op_desc.Input("Weight"); - auto outputs = op_desc.Output("Outputs"); - - for (auto var : inputs) { - CHECK(scope->FindVar(var)); - param_.inputs.push_back( - std::make_pair(var, scope->FindVar(var)->GetMutable())); - } - - param_.weight = scope->FindVar(weight.front())->GetMutable(); - CHECK(param_.weight); - - for (auto var : outputs) { - CHECK(scope->FindVar(var)); - param_.outputs.push_back( - std::make_pair(var, scope->FindVar(var)->GetMutable())); - } - - return true; -} - -} // namespace operators -} // namespace lite -} // namespace paddle - -REGISTER_LITE_OP(graph_op, paddle::lite::operators::GraphOpLite); diff --git a/lite/operators/graph_op.h b/lite/operators/graph_op.h deleted file mode 100644 index 20a7cd9b8d..0000000000 --- a/lite/operators/graph_op.h +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include "lite/core/kernel.h" -#include "lite/core/op_lite.h" -#include "lite/core/scope.h" -#include "lite/core/tensor.h" -#include "lite/operators/op_params.h" -#include "lite/utils/all.h" - -namespace paddle { -namespace lite { -namespace operators { - -class GraphOpLite : public OpLite { - public: - GraphOpLite() {} - - explicit GraphOpLite(const std::string &type) : OpLite(type) {} - - bool CheckShape() const override; - - bool InferShape() const override; - - bool AttachImpl(const cpp::OpDesc &op_desc, lite::Scope *scope) override; - - void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } - - std::string DebugString() const override { return "graph_op"; } - - private: - mutable GraphParam param_; -}; - -} // namespace operators -} // namespace lite -} // namespace paddle diff --git a/lite/operators/grid_sampler_op.cc b/lite/operators/grid_sampler_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/grid_sampler_op.h b/lite/operators/grid_sampler_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/instance_norm_op.cc b/lite/operators/instance_norm_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/instance_norm_op.h b/lite/operators/instance_norm_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/merge_lod_tensor_op.cc b/lite/operators/merge_lod_tensor_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/merge_lod_tensor_op.h b/lite/operators/merge_lod_tensor_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/reduce_prod_op.cc b/lite/operators/reduce_prod_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/reduce_prod_op.h b/lite/operators/reduce_prod_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/sequence_pool_concat_op.cc b/lite/operators/sequence_pool_concat_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/sequence_pool_concat_op.h b/lite/operators/sequence_pool_concat_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/split_lod_tensor_op.cc b/lite/operators/split_lod_tensor_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/split_lod_tensor_op.h b/lite/operators/split_lod_tensor_op.h old mode 100755 new mode 100644 diff --git a/lite/operators/subgraph_op.cc b/lite/operators/subgraph_op.cc old mode 100755 new mode 100644 diff --git a/lite/operators/subgraph_op.h b/lite/operators/subgraph_op.h old mode 100755 new mode 100644 diff --git a/lite/tests/cv/CMakeLists.txt b/lite/tests/cv/CMakeLists.txt index 05fcc06b10..697c9874ef 100644 --- a/lite/tests/cv/CMakeLists.txt +++ b/lite/tests/cv/CMakeLists.txt @@ -1,3 +1,3 @@ if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM) - lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm paddle_api_light ${lite_cv_deps} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(image_convert_test SRCS image_convert_test.cc DEPS paddle_cv_arm) endif() diff --git a/lite/tests/cv/cv_basic.h b/lite/tests/cv/cv_basic.h index 728d316714..92f68543bb 100644 --- a/lite/tests/cv/cv_basic.h +++ b/lite/tests/cv/cv_basic.h @@ -192,7 +192,6 @@ void nv21_bgra_basic(const uint8_t* in_data, nv2bgra(in_data, out_data, srcw, srch, 0, 1); } -/* /* 采用CV_BGR2GRAY,转换公式Gray = 0.1140*B + 0.5870*G + 0.2989*R 采用CV_RGB2GRAY,转换公式Gray = 0.1140*R + 0.5870*G + 0.2989*B @@ -217,6 +216,21 @@ void bgr_gray_basic(const uint8_t* in_data, } } } +void bgra_gray_basic(const uint8_t* in_data, + uint8_t* out_data, + 
int srcw, + int srch) { + for (int i = 0; i < srch; i++) { + const uint8_t* din_ptr = in_data + i * 4 * srcw; + uint8_t* dout_ptr = out_data + i * srcw; + for (int j = 0; j < srcw; j++) { + int sum = din_ptr[0] * 15 + din_ptr[1] * 75 + din_ptr[2] * 38; + sum = sum >> 7; + *dout_ptr++ = sum; + din_ptr += 4; + } + } +} void gray_bgr_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) { for (int i = 0; i < srch; i++) { @@ -228,6 +242,17 @@ void gray_bgr_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) { } } } +void gray_bgra_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + for (int i = 0; i < srch; i++) { + for (int j = 0; j < srcw; j++) { + *dst++ = *src; + *dst++ = *src; + *dst++ = *src; + *dst++ = 255; + src++; + } + } +} // bgr2bgra, rgb2rgba void hwc3_to_hwc4_basic(const uint8_t* src, uint8_t* dst, int srcw, int srch) { for (int i = 0; i < srch; i++) { @@ -340,6 +365,16 @@ void image_convert_basic(const uint8_t* in_data, (srcFormat == ImageFormat::GRAY && dstFormat == ImageFormat::BGR)) { gray_bgr_basic(in_data, out_data, srcw, srch); + } else if ((srcFormat == ImageFormat::RGBA && + dstFormat == ImageFormat::GRAY) || + (srcFormat == ImageFormat::BGRA && + dstFormat == ImageFormat::GRAY)) { + bgra_gray_basic(in_data, out_data, srcw, srch); + } else if ((srcFormat == ImageFormat::GRAY && + dstFormat == ImageFormat::RGBA) || + (srcFormat == ImageFormat::GRAY && + dstFormat == ImageFormat::BGRA)) { + gray_bgra_basic(in_data, out_data, srcw, srch); } else if ((srcFormat == ImageFormat::RGBA && dstFormat == ImageFormat::RGB) || (srcFormat == ImageFormat::BGRA && @@ -525,6 +560,7 @@ void image_resize_basic(const uint8_t* in_data, int y_flag = 0; // only one line if (y_in_start < 0) { y_flag = 1; + y_in_end = 0; } float b0 = ibeta[dy * 2]; float b1 = ibeta[dy * 2 + 1]; @@ -750,6 +786,26 @@ void image_flip_basic(const uint8_t* in_data, flipxy_basic(in_data, srch, srcw, out_data, num); } } +void gray_to_tensor_basic(const uint8_t* bgr, + float* output, + int width, + int height, + float* means, + float* scales, + int num) { + int size = width * height; + float mean_val = means[0]; + float scale_val = scales[0]; + + for (int h = 0; h < height; h++) { + const uint8_t* ptr_bgr = bgr + h * width * num; + float* ptr_h = output + h * width; + for (int i = 0; i < width; i++) { + *ptr_h++ = (ptr_bgr[0] - mean_val) * scale_val; + ptr_bgr += num; + } + } +} void bgr_to_tensor_chw_basic(const uint8_t* bgr, float* output, @@ -828,5 +884,8 @@ void image_to_tensor_basic(const uint8_t* in_data, } else if (layout == LayoutType::kNHWC && (srcFormat == ImageFormat::BGRA || srcFormat == ImageFormat::RGBA)) { bgr_to_tensor_hwc_basic(in_data, output, srcw, srch, means, scales, 4); + } else if (srcFormat == ImageFormat::GRAY && + (layout == LayoutType::kNHWC || layout == LayoutType::kNCHW)) { + gray_to_tensor_basic(in_data, output, srcw, srch, means, scales, 1); } } diff --git a/lite/tests/cv/image_convert_test.cc b/lite/tests/cv/image_convert_test.cc index eefd30f74f..e22e327e8b 100644 --- a/lite/tests/cv/image_convert_test.cc +++ b/lite/tests/cv/image_convert_test.cc @@ -20,6 +20,7 @@ #include "lite/core/profile/timer.h" #include "lite/tests/cv/cv_basic.h" #include "lite/utils/cv/paddle_image_preprocess.h" +#include "time.h" // NOLINT DEFINE_int32(cluster, 3, "cluster id"); DEFINE_int32(threads, 1, "threads num"); @@ -28,15 +29,15 @@ DEFINE_int32(repeats, 1, "repeats times"); DEFINE_bool(basic_test, false, "do all tests"); DEFINE_bool(check_result, true, "check the result"); 
-DEFINE_int32(srcFormat, 0, "input image format"); -DEFINE_int32(dstFormat, 1, "output image format"); +DEFINE_int32(srcFormat, 0, "input image format RGBA"); +DEFINE_int32(dstFormat, 2, "output image format RGB"); DEFINE_int32(srch, 1920, "input height"); DEFINE_int32(srcw, 1080, "input width"); DEFINE_int32(dsth, 960, "output height"); DEFINE_int32(dstw, 540, "output width"); DEFINE_int32(angle, 90, "rotate angel"); DEFINE_int32(flip_num, 0, "flip x"); -DEFINE_int32(layout, 0, "layout nchw"); +DEFINE_int32(layout, 1, "layout nchw"); typedef paddle::lite::utils::cv::ImageFormat ImageFormat; typedef paddle::lite::utils::cv::FlipParam FlipParam; @@ -99,7 +100,7 @@ void test_img(const std::vector& cluster_id, float rotate, FlipParam flip, LayoutType layout, - int test_iter = 1) { + int test_iter = 10) { #ifdef LITE_WITH_ARM paddle::lite::DeviceInfo::Init(); #endif @@ -221,7 +222,7 @@ void test_img(const std::vector& cluster_id, float scales[3] = {1 / 127.5f, 1 / 127.5f, 1 / 127.5f}; if (FLAGS_check_result) { - LOG(INFO) << "image convert basic compute"; + // LOG(INFO) << "image convert basic compute"; image_convert_basic(src, basic_dst, (ImageFormat)srcFormat, @@ -230,7 +231,7 @@ void test_img(const std::vector& cluster_id, srch, out_size); - LOG(INFO) << "image resize basic compute"; + // LOG(INFO) << "image resize basic compute"; image_resize_basic(basic_dst, resize_basic, (ImageFormat)dstFormat, @@ -239,7 +240,7 @@ void test_img(const std::vector& cluster_id, dstw, dsth); - LOG(INFO) << "image rotate basic compute"; + // LOG(INFO) << "image rotate basic compute"; image_rotate_basic(resize_basic, tv_out_ratote_basic, (ImageFormat)dstFormat, @@ -247,7 +248,7 @@ void test_img(const std::vector& cluster_id, dsth, rotate); - LOG(INFO) << "image flip basic compute"; + // LOG(INFO) << "image flip basic compute"; image_flip_basic(resize_basic, tv_out_flip_basic, (ImageFormat)dstFormat, @@ -255,7 +256,7 @@ void test_img(const std::vector& cluster_id, dsth, flip); - LOG(INFO) << "image to tensor basic compute"; + // LOG(INFO) << "image to tensor basic compute"; image_to_tensor_basic(resize_basic, &tensor_basic, (ImageFormat)dstFormat, @@ -267,10 +268,13 @@ void test_img(const std::vector& cluster_id, } Timer t1; + Timer t_convert; + Timer t_resize; + Timer t_flip; + Timer t_rotate; + Timer t_tensor; LOG(INFO) << "saber cv compute"; - double to = 0; - double min_time = 100000; TransParam tparam; tparam.ih = srch; tparam.iw = srcw; @@ -285,15 +289,17 @@ void test_img(const std::vector& cluster_id, ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); for (int i = 0; i < test_iter; ++i) { - t1.Reset(); t1.Start(); - LOG(INFO) << "image convert saber compute"; + // LOG(INFO) << "image convert saber compute"; + t_convert.Start(); // 方法一: image_preprocess.imageCovert(src, lite_dst); - image_preprocess.imageCovert( + image_preprocess.imageConvert( src, lite_dst, (ImageFormat)srcFormat, (ImageFormat)dstFormat); + t_convert.Stop(); - LOG(INFO) << "image resize saber compute"; + // LOG(INFO) << "image resize saber compute"; + t_resize.Start(); // 方法一:image_preprocess.imageResize(lite_dst, resize_tmp); image_preprocess.imageResize(lite_dst, resize_tmp, @@ -302,8 +308,10 @@ void test_img(const std::vector& cluster_id, srch, dstw, dsth); + t_resize.Stop(); - LOG(INFO) << "image rotate saber compute"; + // LOG(INFO) << "image rotate saber compute"; + t_rotate.Start(); // 方法一: image_preprocess.imageRotate(resize_tmp, tv_out_ratote); image_preprocess.imageRotate(resize_tmp, tv_out_ratote, @@ -311,13 
+319,17 @@ void test_img(const std::vector& cluster_id, dstw, dsth, rotate); + t_rotate.Stop(); - LOG(INFO) << "image flip saber compute"; + // LOG(INFO) << "image flip saber compute"; + t_flip.Start(); // 方法一: image_preprocess.imageFlip(resize_tmp, tv_out_flip); image_preprocess.imageFlip( resize_tmp, tv_out_flip, (ImageFormat)dstFormat, dstw, dsth, flip); + t_flip.Stop(); - LOG(INFO) << "image to tensor compute"; + // LOG(INFO) << "image to tensor compute"; + t_tensor.Start(); // 方法一: image_preprocess.image2Tensor( // resize_tmp, &dst_tensor, layout, means, scales); image_preprocess.image2Tensor(resize_tmp, @@ -328,16 +340,27 @@ void test_img(const std::vector& cluster_id, layout, means, scales); - + t_tensor.Stop(); t1.Stop(); - double tdiff = t1.LapTimes().Avg(); - to += tdiff; - if (tdiff < min_time) { - min_time = tdiff; - } } - LOG(INFO) << "image trans total time : " << to - << ", avg time : " << to / test_iter; + LOG(INFO) << "image convert avg time : " << t_convert.LapTimes().Avg() + << ", min time: " << t_convert.LapTimes().Min() + << ", max time: " << t_convert.LapTimes().Max(); + LOG(INFO) << "image resize avg time : " << t_resize.LapTimes().Avg() + << ", min time: " << t_resize.LapTimes().Min() + << ", max time: " << t_resize.LapTimes().Max(); + LOG(INFO) << "image rotate avg time : " << t_rotate.LapTimes().Avg() + << ", min time: " << t_rotate.LapTimes().Min() + << ", max time: " << t_rotate.LapTimes().Max(); + LOG(INFO) << "image flip avg time : " << t_flip.LapTimes().Avg() + << ", min time: " << t_flip.LapTimes().Min() + << ", max time: " << t_flip.LapTimes().Max(); + LOG(INFO) << "image tensor avg time : " << t_tensor.LapTimes().Avg() + << ", min time: " << t_tensor.LapTimes().Min() + << ", max time: " << t_tensor.LapTimes().Max(); + LOG(INFO) << "image trans total avg time : " << t1.LapTimes().Avg() + << ", min time: " << t1.LapTimes().Min() + << ", max time: " << t1.LapTimes().Max(); double max_ratio = 0; double max_diff = 0; @@ -536,7 +559,7 @@ void test_img(const std::vector& cluster_id, } } -#if 1 +#if 0 TEST(TestImageConvertRand, test_func_image_convert_preprocess) { if (FLAGS_basic_test) { for (auto w : {1, 4, 8, 16, 112, 224, 1092}) { @@ -546,19 +569,16 @@ TEST(TestImageConvertRand, test_func_image_convert_preprocess) { for (auto rotate : {180}) { for (auto flip : {0}) { for (auto srcFormat : {0, 1, 2, 3, 4, 11, 12}) { - for (auto dstFormat : {0, 1, 2, 3}) { + for (auto dstFormat : {0, 1, 2, 3, 4}) { for (auto layout : {1}) { - if ((dstFormat == ImageFormat::GRAY && - (srcFormat == ImageFormat::RGBA || - srcFormat == ImageFormat::BGRA)) || - (srcFormat == ImageFormat::GRAY && - (dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) || - (srcFormat == ImageFormat::NV12 || + if ((srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) && - (dstFormat == ImageFormat::GRAY || - dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) { + (dstFormat == ImageFormat::GRAY)) { + continue; + } + if ((dstFormat == ImageFormat::NV12 || + dstFormat == ImageFormat::NV21) && + (srcFormat == ImageFormat::GRAY)) { continue; } if (srcFormat == ImageFormat::NV12 || @@ -591,7 +611,7 @@ TEST(TestImageConvertRand, test_func_image_convert_preprocess) { } } #endif -#if 1 +#if 0 TEST(TestImageConvertRand, test_func_image_resize_preprocess) { if (FLAGS_basic_test) { for (auto w : {1, 4, 8, 16, 112, 224, 1092}) { @@ -601,21 +621,13 @@ TEST(TestImageConvertRand, test_func_image_resize_preprocess) { for (auto rotate : {180}) { for (auto flip : {0}) { for 
(auto srcFormat : {0, 1, 2, 3, 4, 11, 12}) { - for (auto dstFormat : {0, 1, 2, 3}) { + for (auto dstFormat : {0, 1, 2, 3, 4, 11}) { for (auto layout : {1}) { if (dstFormat == ImageFormat::NV12 || - dstFormat == ImageFormat::NV21 || - (dstFormat == ImageFormat::GRAY && - (srcFormat == ImageFormat::RGBA || - srcFormat == ImageFormat::BGRA)) || - (srcFormat == ImageFormat::GRAY && - (dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) || + dstFormat == ImageFormat::NV21 || (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) && - (dstFormat == ImageFormat::GRAY || - dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) { + dstFormat == ImageFormat::GRAY) { continue; } if (srcFormat == ImageFormat::NV12 || @@ -656,25 +668,10 @@ TEST(TestImageConvertRand, test_func_image_trans_preprocess) { for (auto ww : {32, 112}) { for (auto hh : {112}) { for (auto rotate : {90, 180, 270}) { - for (auto flip : {0, 1, 2}) { - for (auto srcFormat : {11}) { - for (auto dstFormat : {3}) { + for (auto flip : {-1, 0, 1}) { + for (auto srcFormat : {0}) { + for (auto dstFormat : {0, 1, 2, 3, 4}) { for (auto layout : {1, 3}) { - if (dstFormat == ImageFormat::NV12 || - dstFormat == ImageFormat::NV21 || - (dstFormat == ImageFormat::GRAY && - (srcFormat == ImageFormat::RGBA || - srcFormat == ImageFormat::BGRA)) || - (srcFormat == ImageFormat::GRAY && - (dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) || - (srcFormat == ImageFormat::NV12 || - srcFormat == ImageFormat::NV21) && - (dstFormat == ImageFormat::GRAY || - dstFormat == ImageFormat::RGBA || - dstFormat == ImageFormat::BGRA)) { - continue; - } if (srcFormat == ImageFormat::NV12 || srcFormat == ImageFormat::NV21) { if (w % 2) { // is not ou shu, two line y == one line @@ -717,7 +714,8 @@ TEST(TestImageConvertCustom, test_func_image_preprocess_custom) { (ImageFormat)FLAGS_dstFormat, FLAGS_angle, (FlipParam)FLAGS_flip_num, - (LayoutType)FLAGS_layout); + (LayoutType)FLAGS_layout, + 20); } #endif #endif diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index 4622376742..a7ae414573 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -1,68 +1,70 @@ -if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_yolo_box_compute SRCS yolo_box_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_fc_compute SRCS fc_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_elementwise_compute SRCS elementwise_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_lrn_compute SRCS lrn_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - 
lite_cc_test(test_kernel_decode_bboxes_compute SRCS decode_bboxes_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_box_coder_compute SRCS box_coder_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_activation_compute SRCS activation_compute_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_axpy_compute SRCS axpy_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_conv2d_transpose_compute SRCS conv2d_transpose_compute_test.cc DEPS arena_framework ${bm_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_norm_compute SRCS norm_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_cast_compute SRCS cast_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_instance_norm_compute SRCS instance_norm_compute_test.cc DEPS arena_framework ${bm_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_grid_sampler_compute SRCS grid_sampler_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_sequence_softmax_compute SRCS sequence_softmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_im2sequence_compute SRCS im2sequence_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_compare_compute SRCS compare_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_logical_xor_compute SRCS logical_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_topk_compute SRCS topk_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_increment_compute SRCS increment_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_write_to_array_compute SRCS write_to_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_read_from_array_compute SRCS read_from_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} 
${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) +if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) + lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_yolo_box_compute SRCS yolo_box_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_fc_compute SRCS fc_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_elementwise_compute SRCS elementwise_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_lrn_compute SRCS lrn_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_decode_bboxes_compute SRCS decode_bboxes_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_box_coder_compute SRCS box_coder_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_activation_compute SRCS activation_compute_test.cc DEPS arena_framework ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_axpy_compute SRCS axpy_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_conv2d_transpose_compute SRCS conv2d_transpose_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_norm_compute SRCS norm_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_cast_compute SRCS cast_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_instance_norm_compute SRCS instance_norm_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + 
lite_cc_test(test_kernel_grid_sampler_compute SRCS grid_sampler_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_sequence_softmax_compute SRCS sequence_softmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_im2sequence_compute SRCS im2sequence_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_compare_compute SRCS compare_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_logical_xor_compute SRCS logical_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_topk_compute SRCS topk_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_increment_compute SRCS increment_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_write_to_array_compute SRCS write_to_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_read_from_array_compute SRCS read_from_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_batch_norm_compute SRCS batch_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_pool_compute SRCS pool_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) if(LITE_BUILD_EXTRA) - lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS arena_framework 
${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_unsqueeze_compute SRCS unsqueeze_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_assign_compute SRCS assign_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_assign_value_compute SRCS assign_value_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reduce_mean_compute SRCS reduce_mean_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_reduce_prod_compute SRCS reduce_prod_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_stack_compute SRCS stack_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_range_compute SRCS range_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_affine_channel_compute SRCS affine_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_generate_proposals_compute SRCS generate_proposals_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - #lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_unsqueeze_compute SRCS unsqueeze_compute_test.cc DEPS arena_framework ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} 
${host_kernels}) + lite_cc_test(test_kernel_assign_compute SRCS assign_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_assign_value_compute SRCS assign_value_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reduce_mean_compute SRCS reduce_mean_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_reduce_prod_compute SRCS reduce_prod_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_stack_compute SRCS stack_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_range_compute SRCS range_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_affine_channel_compute SRCS affine_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_generate_proposals_compute SRCS generate_proposals_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + #lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() - lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_negative_compute SRCS negative_compute_test.cc DEPS arena_framework ${xpu_kernels} ${bm_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_bilinear_interp_compute SRCS bilinear_interp_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} 
${host_kernels}) - lite_cc_test(test_kernel_nearest_interp_compute SRCS nearest_interp_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_shape_compute SRCS shape_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_crop_compute SRCS crop_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_squeeze_compute SRCS squeeze_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_slice_compute SRCS slice_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_expand_compute SRCS expand_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) - lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_negative_compute SRCS negative_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_bilinear_interp_compute SRCS bilinear_interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_nearest_interp_compute SRCS nearest_interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_shape_compute SRCS shape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_crop_compute SRCS crop_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_squeeze_compute SRCS squeeze_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_slice_compute SRCS slice_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} 
${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_expand_compute SRCS expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_kernel_matmul_compute SRCS matmul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git a/lite/tests/kernels/batch_norm_compute_test.cc b/lite/tests/kernels/batch_norm_compute_test.cc new file mode 100644 index 0000000000..ae65e0e3c3 --- /dev/null +++ b/lite/tests/kernels/batch_norm_compute_test.cc @@ -0,0 +1,181 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" + +namespace paddle { +namespace lite { + +class BatchNormComputeTest : public arena::TestCase { + protected: + // common attributes for this op. + std::string op_type_ = "batch_norm"; + std::string input_ = "x"; + std::string scale_ = "scale"; + std::string bias_ = "bias"; + std::string mean_ = "mean"; + std::string variance_ = "variance"; + std::string output_ = "y"; + std::string mean_out_ = "mean_out"; + std::string saved_mean_ = "saved_mean"; + std::string variance_out_ = "variance_out"; + std::string saved_variance_ = "saved_variance"; + DDim dims_{{1, 2, 3, 4}}; + bool use_global_stats_ = false; + float momentum_ = 0.9; + float epsilon_ = 1e-5f; + std::string data_layout_ = "NCHW"; + int is_test_ = 1; + + public: + BatchNormComputeTest(const Place& place, + const std::string& alias, + DDim dims, + float epsilon) + : TestCase(place, alias), dims_(dims), epsilon_(epsilon) {} + + void RunBaseline(Scope* scope) override { + auto x = scope->FindTensor(input_); + auto scale = scope->FindTensor(scale_); + auto bias = scope->FindTensor(bias_); + auto mean = scope->FindTensor(mean_); + auto variance = scope->FindTensor(variance_); + + auto y = scope->NewTensor(output_); + auto mean_out = scope->NewTensor(mean_out_); + auto variance_out = scope->NewTensor(variance_out_); + auto saved_mean = scope->NewTensor(saved_mean_); + auto saved_variance = scope->NewTensor(saved_variance_); + CHECK(y); + CHECK(mean_out); + CHECK(variance_out); + CHECK(saved_mean); + CHECK(saved_variance); + y->Resize(dims_); + + int64_t channel_size = 0; + if (data_layout_ == "NCHW") { + channel_size = dims_[1]; + } else { + LOG(FATAL) << "Unknown storage order: " << data_layout_; + } + mean_out->Resize({channel_size}); + variance_out->Resize({channel_size}); + saved_mean->Resize({channel_size}); + saved_variance->Resize({channel_size}); + + auto x_data = x->data(); + auto y_data = y->mutable_data(); + auto scale_data = scale->data(); + auto bias_data = bias->data(); + auto mean_data = mean->data(); + auto variance_data = variance->data(); + + 
int64_t outer_size = 0; + int64_t inner_size = 0; + if (data_layout_ == "NCHW") { + outer_size = dims_[0]; + inner_size = dims_.Slice(2, dims_.size()).production(); + } else { + LOG(FATAL) << "Unknown storage order: " << data_layout_; + } + auto x_ptr = x_data; + auto y_ptr = y_data; + for (int o = 0; o < outer_size; o++) { + for (int c = 0; c < channel_size; c++) { + for (int i = 0; i < inner_size; i++) { + float norm_x = + (*x_ptr - mean_data[c]) / std::sqrt(variance_data[c] + epsilon_); + *y_ptr = norm_x * scale_data[c] + bias_data[c]; + x_ptr++; + y_ptr++; + } + } + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType(op_type_); + op_desc->SetInput("X", {input_}); + op_desc->SetInput("Bias", {bias_}); + op_desc->SetInput("Scale", {scale_}); + op_desc->SetInput("Mean", {mean_}); + op_desc->SetInput("Variance", {variance_}); + op_desc->SetOutput("Y", {output_}); + op_desc->SetOutput("MeanOut", {mean_out_}); + op_desc->SetOutput("VarianceOut", {variance_out_}); + op_desc->SetOutput("SavedMean", {saved_mean_}); + op_desc->SetOutput("SavedVariance", {saved_variance_}); + op_desc->SetAttr("epsilon", epsilon_); + op_desc->SetAttr("momentum", momentum_); + op_desc->SetAttr("use_global_stats", use_global_stats_); + op_desc->SetAttr("data_layout", data_layout_); + op_desc->SetAttr("is_test", is_test_); + } + + void PrepareData() override { + std::vector din(dims_.production()); + fill_data_rand(din.data(), -1.f, 1.f, dims_.production()); + + DDim scale_dim({dims_[1]}); + std::vector scale(scale_dim.production()); + fill_data_rand(scale.data(), -1.f, 1.f, scale_dim.production()); + + std::vector bias(scale_dim.production()); + fill_data_rand(bias.data(), -1.f, 1.f, scale_dim.production()); + + std::vector mean(scale_dim.production()); + fill_data_rand(mean.data(), -1.f, 1.f, scale_dim.production()); + + std::vector variance(scale_dim.production()); + fill_data_rand(variance.data(), 0.f, 1.f, scale_dim.production()); + + SetCommonTensor(input_, dims_, din.data()); + SetCommonTensor(scale_, scale_dim, scale.data()); + SetCommonTensor(bias_, scale_dim, bias.data()); + SetCommonTensor(mean_, scale_dim, mean.data()); + SetCommonTensor(variance_, scale_dim, variance.data()); + } +}; + +TEST(BatchNorm, precision) { + LOG(INFO) << "test BatchNorm op"; + float abs_error = 2e-5; + Place place; +#if defined(LITE_WITH_XPU) + place = TARGET(kXPU); +#elif defined(LITE_WITH_NPU) + place = TARGET(kNPU); +#else + return; +#endif + + for (auto dims : + std::vector>{{1, 2, 3, 4}, {5, 6, 7, 8}}) { + for (auto epsilon : {1e-5f}) { + std::unique_ptr tester( + new BatchNormComputeTest(place, "def", DDim(dims), epsilon)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision( + {"mean_out", "saved_mean", "variance_out", "saved_variance"}); + } + } +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/dropout_compute_test.cc b/lite/tests/kernels/dropout_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/gather_compute_test.cc b/lite/tests/kernels/gather_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/grid_sampler_compute_test.cc b/lite/tests/kernels/grid_sampler_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/instance_norm_compute_test.cc b/lite/tests/kernels/instance_norm_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/layer_norm_compute_test.cc b/lite/tests/kernels/layer_norm_compute_test.cc old mode 100755 new mode 100644 
diff --git a/lite/tests/kernels/lookup_table_compute_test.cc b/lite/tests/kernels/lookup_table_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/mul_compute_test.cc b/lite/tests/kernels/mul_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/pool_compute_test.cc b/lite/tests/kernels/pool_compute_test.cc new file mode 100644 index 0000000000..d94c2e5154 --- /dev/null +++ b/lite/tests/kernels/pool_compute_test.cc @@ -0,0 +1,367 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" + +namespace paddle { +namespace lite { + +class PoolComputeTest : public arena::TestCase { + protected: + // common attributes for this op. + std::string op_type_ = "pool2d"; + std::string x_ = "x"; + std::string out_ = "out"; + DDim dims_{{1, 2, 3, 4}}; + std::string pooling_type_ = "max"; + bool global_pooling_ = false; + std::vector strides_{1, 1}; + std::vector paddings_{0, 0}; + std::vector ksize_{2, 2}; + bool exclusive_ = true; + bool ceil_mode_ = false; + bool adaptive_ = false; + std::string padding_algorithm_; + + public: + PoolComputeTest(const Place& place, + const std::string& alias, + DDim dims, + std::string pooling_type, + bool global_pooling, + std::vector strides = {1, 1}, + std::vector paddings = {0, 0}, + std::vector ksize = {2, 2}, + bool exclusive = true, + bool ceil_mode = false, + bool adaptive = false, + std::string padding_algorithm = "") + : TestCase(place, alias), + dims_(dims), + pooling_type_(pooling_type), + global_pooling_(global_pooling), + strides_(strides), + paddings_(paddings), + ksize_(ksize), + exclusive_(exclusive), + ceil_mode_(ceil_mode), + adaptive_(adaptive) {} + + void RunBaseline(Scope* scope) override { + std::vector paddings_new{paddings_}; + if (paddings_new.size() == 1L) { + paddings_new = std::vector(4, paddings_new[0]); + } else if (paddings_new.size() == 2L) { + paddings_new.insert(paddings_new.begin(), paddings_new[0]); + paddings_new.insert(paddings_new.begin() + 2, paddings_new[2]); + } + CHECK_EQ(paddings_new.size(), 4L); + if (padding_algorithm_ == "SAME") { + for (int i = 0; i < strides_.size(); ++i) { + int out_size = (dims_[i + 2] + strides_[i] - 1) / strides_[i]; + int pad_sum = + std::max((out_size - 1) * strides_[i] + ksize_[i] - dims_[i + 2], + (int64_t)0); + int pad_0 = pad_sum / 2; + int pad_1 = pad_sum - pad_0; + *(paddings_new.begin() + i * 2) = pad_0; + *(paddings_new.begin() + i * 2 + 1) = pad_1; + } + } + if (padding_algorithm_ == "VALID" || global_pooling_ || adaptive_) { + for (size_t i = 0; i < paddings_new.size(); i++) { + paddings_new[i] = 0; + } + } + + std::vector ksize_new{ksize_}; + if (global_pooling_) { + ksize_new.clear(); + ksize_new.push_back(dims_[2]); + ksize_new.push_back(dims_[3]); + } + + std::vector out_shape{dims_[0], 
dims_[1]}; + if (adaptive_) { + out_shape.insert(out_shape.end(), ksize_new.begin(), ksize_new.end()); + } else { + for (size_t i = 0; i < ksize_new.size(); ++i) { + int out_size; + if (!ceil_mode_) { + out_size = (dims_[i + 2] - ksize_new[i] + paddings_new[2 * i] + + paddings_new[2 * i + 1]) / + strides_[i] + + 1; + } else { + out_size = (dims_[i + 2] - ksize_new[i] + paddings_new[2 * i] + + paddings_new[2 * i + 1] + strides_[i] - 1) / + strides_[i] + + 1; + } + out_shape.push_back(out_size); + } + } + + auto out = scope->NewTensor(out_); + CHECK(out); + out->Resize(DDim(out_shape)); + auto out_dims = out->dims(); + auto dst_ptr = out->mutable_data(); + + auto x = scope->FindTensor(x_); + auto src_ptr = x->data(); + + int in_n = dims_[0]; + int in_c = dims_[1]; + int in_h = dims_[2]; + int in_w = dims_[3]; + int size_in_n = in_c * in_h * in_w; + int size_in_c = in_h * in_w; + + int out_h = out_dims[2]; + int out_w = out_dims[3]; + int size_out_n = in_c * out_h * out_w; + int size_out_c = out_h * out_w; + + int window_h = ksize_new[0]; + int window_w = ksize_new[1]; + int stride_h = strides_[0]; + int stride_w = strides_[1]; + int pad_t = paddings_new[0]; + int pad_l = paddings_new[2]; + + if (global_pooling_) { + for (int n = 0; n < in_n; ++n) { + for (int c = 0; c < in_c; ++c) { + const float* src = src_ptr + n * size_in_n + c * size_in_c; + float res = src[0]; + if (pooling_type_ == "max") { + for (int i = 1; i < size_in_c; ++i) { + float cur_val = src[i]; + res = cur_val > res ? cur_val : res; + } + } else if (pooling_type_ == "avg") { + for (int i = 1; i < size_in_c; ++i) { + float cur_val = src[i]; + res += cur_val; + } + res /= size_in_c; + } + dst_ptr[n * size_out_n + c] = res; + } + } + } else { + for (int n = 0; n < in_n; ++n) { + for (int c = 0; c < in_c; ++c) { + for (int h = 0; h < out_h; ++h) { + int sh = h * stride_h; + int eh = sh + window_h; + sh = (sh - pad_t) < 0 ? 0 : sh - pad_t; + eh = (eh - pad_t) > in_h ? in_h : eh - pad_t; + for (int w = 0; w < out_w; ++w) { + int sw = w * stride_w; + int ew = sw + window_w; + sw = (sw - pad_l) < 0 ? 0 : sw - pad_l; + ew = (ew - pad_l) > in_w ? in_w : ew - pad_l; + int pooling_size = (ew - sw) * (eh - sh); + if (pooling_size == 0) continue; + float res = 0.f; + for (int kh = sh; kh < eh; ++kh) { + for (int kw = sw; kw < ew; ++kw) { + int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw; + if (kh == sh && kw == sw) { + res = src_ptr[src_idx]; + } else { + if (pooling_type_ == "max") { + res = res >= src_ptr[src_idx] ? 
res : src_ptr[src_idx]; + } + if (pooling_type_ == "avg") { + res += src_ptr[src_idx]; + } + } + } + } + if (pooling_type_ == "avg") { + if (exclusive_) { + res /= pooling_size; + } else { + res /= window_h * window_w; + } + } + dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res; + } + } + } + } + } + } + + void PrepareOpDesc(cpp::OpDesc* op_desc) { + op_desc->SetType(op_type_); + op_desc->SetInput("X", {x_}); + op_desc->SetOutput("Out", {out_}); + op_desc->SetAttr("pooling_type", pooling_type_); + op_desc->SetAttr("global_pooling", global_pooling_); + op_desc->SetAttr("strides", strides_); + op_desc->SetAttr("paddings", paddings_); + op_desc->SetAttr("ksize", ksize_); + op_desc->SetAttr("exclusive", exclusive_); + op_desc->SetAttr("ceil_mode", ceil_mode_); + op_desc->SetAttr("adaptive", adaptive_); + if (!padding_algorithm_.empty()) { + op_desc->SetAttr("padding_algorithm", padding_algorithm_); + } + } + + void PrepareData() override { + std::vector din(dims_.production()); + fill_data_rand(din.data(), -1.f, 1.f, dims_.production()); + SetCommonTensor(x_, dims_, din.data()); + } +}; + +void TestPoolGlobal(Place place, float abs_error = 2e-5) { + for (auto dims : std::vector>{{2, 3, 4, 5}}) { + for (std::string pooling_type : {"max", "avg"}) { + std::unique_ptr tester( + new PoolComputeTest(place, "def", DDim(dims), pooling_type, true)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } + } +} + +void TestPoolAlgorithm(Place place, float abs_error = 2e-5) { + for (auto dims : std::vector>{{2, 3, 4, 5}}) { + for (auto pooling_type : {"max", "avg"}) { + for (auto padding_algorithm : {"SAME", "VALID"}) { + std::unique_ptr tester( + new PoolComputeTest(place, + "def", + DDim(dims), + pooling_type, + false, + {2, 2}, + {0, 0}, + {2, 2}, + true, + false, + false, + padding_algorithm)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } + } + } +} + +void TestPoolHelper(Place place, + float abs_error, + std::vector dims, + std::string pooling_type, + std::vector strides, + std::vector paddings, + std::vector ksize) { + std::unique_ptr tester(new PoolComputeTest( + place, "def", DDim(dims), pooling_type, false, strides, paddings, ksize)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); +} + +void TestPoolStrides(Place place, float abs_error = 2e-5) { + for (auto pooling_type : {"max", "avg"}) { + TestPoolHelper( + place, abs_error, {2, 3, 6, 7}, pooling_type, {1, 1}, {0, 0}, {2, 2}); + TestPoolHelper( + place, abs_error, {2, 3, 6, 7}, pooling_type, {1, 2}, {0, 0}, {2, 2}); + TestPoolHelper( + place, abs_error, {2, 3, 6, 7}, pooling_type, {2, 2}, {0, 0}, {2, 2}); + } +} + +void TestPoolPaddings(Place place, float abs_error = 2e-5) { + for (auto pooling_type : {"max", "avg"}) { + TestPoolHelper( + place, abs_error, {2, 3, 6, 7}, pooling_type, {1, 1}, {0, 0}, {2, 2}); + TestPoolHelper( + place, abs_error, {2, 3, 6, 7}, pooling_type, {1, 1}, {1, 1}, {2, 2}); + TestPoolHelper(place, + abs_error, + {2, 3, 6, 7}, + pooling_type, + {1, 1}, + {0, 0, 1, 1}, + {2, 2}); + TestPoolHelper(place, + abs_error, + {2, 3, 6, 7}, + pooling_type, + {1, 1}, + {1, 0, 1, 0}, + {2, 2}); + TestPoolHelper(place, + abs_error, + {2, 3, 6, 7}, + pooling_type, + {1, 1}, + {1, 0, 0, 1}, + {2, 2}); + } +} + +void TestPoolKsize(Place place, float abs_error = 2e-5) { + for (auto pooling_type : {"max", "avg"}) { + for (auto ksize : {2, 3}) { + TestPoolHelper(place, + abs_error, + {2, 3, 6, 7}, + pooling_type, + 
{1, 1}, + {0, 0}, + {ksize, ksize}); + TestPoolHelper(place, + abs_error, + {2, 3, 6, 7}, + pooling_type, + {2, 2}, + {1, 1}, + {ksize, ksize}); + } + } +} + +TEST(Pool, precision) { + LOG(INFO) << "test pool op"; + float abs_error = 2e-5; + Place place; +#if defined(LITE_WITH_NPU) + place = TARGET(kNPU); + abs_error = 1e-2; // Using fp16 in NPU +#else + return; +#endif + + TestPoolGlobal(place, abs_error); + TestPoolAlgorithm(place, abs_error); + TestPoolStrides(place, abs_error); + TestPoolPaddings(place, abs_error); + TestPoolKsize(place, abs_error); +} + +} // namespace lite +} // namespace paddle diff --git a/lite/tests/kernels/reduce_prod_compute_test.cc b/lite/tests/kernels/reduce_prod_compute_test.cc old mode 100755 new mode 100644 diff --git a/lite/tests/kernels/reshape_compute_test.cc b/lite/tests/kernels/reshape_compute_test.cc old mode 100755 new mode 100644 index 85cd724148..b82c291a41 --- a/lite/tests/kernels/reshape_compute_test.cc +++ b/lite/tests/kernels/reshape_compute_test.cc @@ -16,6 +16,7 @@ #include "lite/api/paddle_use_kernels.h" #include "lite/api/paddle_use_ops.h" #include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" namespace paddle { namespace lite { @@ -29,19 +30,19 @@ class ReshapeComputeTester : public arena::TestCase { std::string xshape_ = "xshape"; std::vector shape_tensor_vct_; std::string shape_tensor_; - DDim x_dims_; + DDim dims_; std::vector shape_; bool inplace_ = false; public: ReshapeComputeTester(const Place& place, const std::string& alias, - DDim x_dims, + DDim dims, std::vector shape, bool is_shape_tensor_vct = false, bool is_shape_tensor = false, bool is_shape = true) - : TestCase(place, alias), x_dims_(x_dims) { + : TestCase(place, alias), dims_(dims) { if (is_shape_tensor_vct) { for (size_t i = 0; i < shape.size(); i++) { shape_tensor_vct_.emplace_back(op_type_ + "/shape" + std::to_string(i)); @@ -60,7 +61,6 @@ class ReshapeComputeTester : public arena::TestCase { CHECK(out); auto* x = scope->FindTensor(input_); - auto x_dims = x->dims(); std::vector out_shape; if (shape_tensor_vct_.size() > 0) { @@ -86,8 +86,8 @@ class ReshapeComputeTester : public arena::TestCase { CHECK_EQ(unk_dim_idx, -1); unk_dim_idx = i; } else if (out_shape[i] == 0) { - CHECK_LE(i, x_dims.size()); - final_out_shape[i] = x_dims[i]; + CHECK_LE(i, dims_.size()); + final_out_shape[i] = dims_[i]; } else if (out_shape[i] > 0) { final_out_shape[i] = out_shape[i]; } else { @@ -97,18 +97,18 @@ class ReshapeComputeTester : public arena::TestCase { } if (unk_dim_idx > -1) { - final_out_shape[unk_dim_idx] = x_dims.production() / cap; + final_out_shape[unk_dim_idx] = dims_.production() / cap; } out->Resize(final_out_shape); auto x_data = x->data(); auto out_data = out->mutable_data(); - memcpy(out_data, x_data, sizeof(float) * x_dims.production()); + memcpy(out_data, x_data, sizeof(float) * dims_.production()); if (op_type_ == "reshape2") { auto* xshape = scope->NewTensor(xshape_); - auto xshape_dims = x_dims.Vectorize(); + auto xshape_dims = dims_.Vectorize(); xshape_dims.insert(xshape_dims.begin(), 0); xshape->Resize(xshape_dims); } @@ -134,11 +134,9 @@ class ReshapeComputeTester : public arena::TestCase { } void PrepareData() override { - std::vector data(x_dims_.production()); - for (int i = 0; i < x_dims_.production(); i++) { - data[i] = i * 1.1; - } - SetCommonTensor(input_, x_dims_, data.data()); + std::vector din(dims_.production()); + fill_data_rand(din.data(), -1.f, 1.f, dims_.production()); + SetCommonTensor(input_, dims_, din.data()); if 
(shape_tensor_vct_.size() > 0) { for (size_t i = 0; i < shape_.size(); i++) { @@ -161,13 +159,16 @@ TEST(Reshape, precision) { LOG(INFO) << "test Reshape op"; float abs_error = 2e-5; Place place; -#ifdef LITE_WITH_XPU +#if defined(LITE_WITH_NPU) + place = TARGET(kNPU); + abs_error = 1e-2; // Using fp16 in NPU +#elif defined(LITE_WITH_XPU) place = TARGET(kXPU); #else return; #endif - DDim x_dims{{2, 3, 4, 5}}; + DDim dims{{2, 3, 4, 5}}; std::vector> shapes{{5, 4, 3, 2}, {2, 3, 20}, {2, 60}, @@ -176,8 +177,11 @@ TEST(Reshape, precision) { {0, 0, 20}, {0, 0, -1}}; for (auto shape : shapes) { +#ifdef LITE_WITH_NPU + if (dims.size() > 4 || shape.size() > 4) continue; +#endif std::unique_ptr tester( - new ReshapeComputeTester(place, "def", x_dims, shape)); + new ReshapeComputeTester(place, "def", dims, shape)); arena::Arena arena(std::move(tester), place, abs_error); arena.TestPrecision({"xshape"}); } diff --git a/lite/tests/kernels/scale_compute_test.cc b/lite/tests/kernels/scale_compute_test.cc index 706936d2b1..1ededcd52d 100644 --- a/lite/tests/kernels/scale_compute_test.cc +++ b/lite/tests/kernels/scale_compute_test.cc @@ -16,6 +16,7 @@ #include "lite/api/paddle_use_kernels.h" #include "lite/api/paddle_use_ops.h" #include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" namespace paddle { namespace lite { @@ -23,31 +24,33 @@ namespace lite { class ScaleComputeTester : public arena::TestCase { protected: // common attributes for this op. - std::string input_ = "x"; - std::string output_ = "out"; + std::string x_ = "x"; + std::string out_ = "out"; + DDim x_dims_{{100, 20}}; float scale_ = 0.; float bias_ = 0.; - DDim dims_{{100, 20}}; bool bias_after_scale_; public: ScaleComputeTester(const Place& place, const std::string& alias, + const DDim& x_dims, float scale, float bias, bool bias_after_scale) : TestCase(place, alias), + x_dims_(x_dims), scale_(scale), bias_(bias), bias_after_scale_(bias_after_scale) {} void RunBaseline(Scope* scope) override { - auto* out = scope->NewTensor(output_); + auto* out = scope->NewTensor(out_); CHECK(out); - out->Resize(dims_); + out->Resize(x_dims_); auto* out_data = out->mutable_data(); - auto* x = scope->FindTensor(input_); + auto* x = scope->FindTensor(x_); const auto* x_data = x->data(); float bias = bias_; @@ -56,35 +59,34 @@ class ScaleComputeTester : public arena::TestCase { bias *= scale_; } - for (int i = 0; i < dims_.production(); i++) { + for (int i = 0; i < x_dims_.production(); i++) { out_data[i] = x_data[i] * scale_ + bias; } } void PrepareOpDesc(cpp::OpDesc* op_desc) { op_desc->SetType("scale"); - op_desc->SetInput("X", {input_}); - op_desc->SetOutput("Out", {output_}); + op_desc->SetInput("X", {x_}); + op_desc->SetOutput("Out", {out_}); op_desc->SetAttr("scale", scale_); op_desc->SetAttr("bias", bias_); op_desc->SetAttr("bias_after_scale", bias_after_scale_); } void PrepareData() override { - std::vector data(dims_.production()); - - for (int i = 0; i < dims_.production(); i++) { - data[i] = i * 1.1; - } - - SetCommonTensor(input_, dims_, data.data()); + std::vector x(x_dims_.production()); + fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production()); + SetCommonTensor(x_, x_dims_, x.data()); } }; TEST(Scale, precision) { Place place; float abs_error = 2e-5; -#if defined(LITE_WITH_ARM) +#if defined(LITE_WITH_NPU) + place = TARGET(kNPU); + abs_error = 4e-3; // Using fp16 in NPU +#elif defined(LITE_WITH_ARM) place = TARGET(kARM); #elif defined(LITE_WITH_XPU) place = TARGET(kXPU); @@ -95,13 +97,16 @@ TEST(Scale, precision) { 
return; #endif - for (float scale : {0.123, 2., -1.2}) { - for (float bias : {1., 0., -1.2331}) { - for (bool bias_before : {true, false}) { - std::unique_ptr tester( - new ScaleComputeTester(place, "def", scale, bias, bias_before)); - arena::Arena arena(std::move(tester), place, abs_error); - arena.TestPrecision(); + for (auto x_dims : + std::vector>{{5, 2, 3, 4}, {8, 3, 5}, {12, 3}}) { + for (float scale : {0.123, 2., -1.2}) { + for (float bias : {1., 0., -1.2331}) { + for (bool bias_after_scale : {true, false}) { + std::unique_ptr tester(new ScaleComputeTester( + place, "def", DDim(x_dims), scale, bias, bias_after_scale)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); + } } } } @@ -117,8 +122,8 @@ TEST(Scale, performance) { return; #endif - std::unique_ptr tester( - new ScaleComputeTester(place, "def", 1.2, 1.1, true)); + std::unique_ptr tester(new ScaleComputeTester( + place, "def", DDim(std::vector{5, 2, 3, 4}), 1.2, 1.1, true)); // To modify the arm context, one can retrive the context as follows. // #ifdef LITE_WITH_ARM diff --git a/lite/tests/kernels/shuffle_channel_compute_test.cc b/lite/tests/kernels/shuffle_channel_compute_test.cc index 66123625fa..66dd7bbe37 100644 --- a/lite/tests/kernels/shuffle_channel_compute_test.cc +++ b/lite/tests/kernels/shuffle_channel_compute_test.cc @@ -12,12 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -// TODO(FrostML): shaffle_channel cannot pass on CI, but ok in local machine. -// Open this. -/*#include +#include #include "lite/api/paddle_use_kernels.h" #include "lite/api/paddle_use_ops.h" #include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" namespace paddle { namespace lite { @@ -40,28 +39,29 @@ class ShuffleChannelComputeTester : public arena::TestCase { auto* out = scope->NewTensor(output_); CHECK(out); out->Resize(dims_); - auto* outputs = out->mutable_data(); + auto* out_data = out->mutable_data(); + auto* x = scope->FindTensor(input_); - const auto* inputs = x->data(); - DDim x_dims = x->dims(); - int num = x->dims()[0]; - int channel = x->dims()[1]; - int height = x->dims()[2]; - int width = x->dims()[3]; - int fea_size = channel * height * width; + const auto* in_data = x->data(); + + int num = dims_[0]; + int channel = dims_[1]; + int height = dims_[2]; + int width = dims_[3]; + int feather_size = channel * height * width; int spatial_size = height * width; - int group_row = group_; - int group_col = channel / group_; - for (int k = 0; k < num; ++k) { - inputs += k * fea_size; - outputs += k * fea_size; - for (int i = 0; i < group_row; ++i) { - for (int j = 0; j < group_col; ++j) { - const float* p_i = inputs + (i * group_col + j) * spatial_size; - float* p_o = outputs + (j * group_row + i) * spatial_size; + int group_num = group_; + int group_size = channel / group_; + for (int n = 0; n < num; n++) { + for (int i = 0; i < group_num; ++i) { + for (int j = 0; j < group_size; ++j) { + const float* p_i = in_data + (i * group_size + j) * spatial_size; + float* p_o = out_data + (j * group_num + i) * spatial_size; memcpy(p_o, p_i, spatial_size * sizeof(float)); } } + in_data += feather_size; + out_data += feather_size; } } @@ -73,35 +73,33 @@ class ShuffleChannelComputeTester : public arena::TestCase { } void PrepareData() override { - std::vector data(dims_.production()); - - for (int i = 0; i < dims_.production(); i++) { - data[i] = i * 1.1; - } - - SetCommonTensor(input_, dims_, data.data()); + std::vector 
din(dims_.production()); + fill_data_rand(din.data(), -1.f, 1.f, dims_.production()); + SetCommonTensor(input_, dims_, din.data()); } }; -void test_shuffle_channel(Place place) { - for (int group : {4}) { +void test_shuffle_channel(Place place, float abs_error = 2e-5) { + for (int group : {2, 4, 8}) { std::unique_ptr tester( new ShuffleChannelComputeTester(place, "def", group)); - arena::Arena arena(std::move(tester), place, 2e-5); + arena::Arena arena(std::move(tester), place, abs_error); arena.TestPrecision(); } } TEST(ShuffleChannel, precision) { -// #ifdef LITE_WITH_X86 -// Place place(TARGET(kX86)); -// #endif -#ifdef LITE_WITH_ARM - Place place(TARGET(kARM)); - test_shuffle_channel(place); + Place place; + float abs_error = 2e-5; +#ifdef LITE_WITH_NPU + place = TARGET(kNPU); + abs_error = 1e-2; // Using fp16 in NPU +#else + return; #endif + + test_shuffle_channel(place, abs_error); } } // namespace lite } // namespace paddle -*/ diff --git a/lite/tests/kernels/softmax_compute_test.cc b/lite/tests/kernels/softmax_compute_test.cc old mode 100755 new mode 100644 index 94100da2b1..a91f6534ff --- a/lite/tests/kernels/softmax_compute_test.cc +++ b/lite/tests/kernels/softmax_compute_test.cc @@ -25,33 +25,33 @@ class SoftmaxComputeTest : public arena::TestCase { protected: // common attributes for this op. std::string op_type_ = "softmax"; - std::string input_ = "x"; - std::string output_ = "out"; - DDim dims_{{1, 2, 3, 4}}; + DDim x_dims_{{1, 2, 3, 4}}; + std::string x_ = "x"; + std::string out_ = "out"; int axis_ = 1; public: SoftmaxComputeTest(const Place& place, const std::string& alias, - DDim dims, + DDim x_dims, int axis) - : TestCase(place, alias), dims_(dims), axis_(axis) {} + : TestCase(place, alias), x_dims_(x_dims), axis_(axis) {} void RunBaseline(Scope* scope) override { - auto x = scope->FindTensor(input_); - auto out = scope->NewTensor(output_); + auto x = scope->FindTensor(x_); + auto out = scope->NewTensor(out_); CHECK(out); - out->Resize(dims_); + out->Resize(x_dims_); auto x_data = x->data(); auto out_data = out->mutable_data(); - auto x_rank = dims_.size(); + auto x_rank = x_dims_.size(); if (axis_ < 0) { axis_ += x_rank; } - int axis_size = dims_[axis_]; - int outer_num = dims_.Slice(0, axis_).production(); - int inner_num = dims_.Slice(axis_ + 1, x_rank).production(); + int axis_size = x_dims_[axis_]; + int outer_num = x_dims_.Slice(0, axis_).production(); + int inner_num = x_dims_.Slice(axis_ + 1, x_rank).production(); int compute_size = outer_num * inner_num; for (int i = 0; i < compute_size; i++) { int idx_inner = i % inner_num; @@ -84,15 +84,15 @@ class SoftmaxComputeTest : public arena::TestCase { void PrepareOpDesc(cpp::OpDesc* op_desc) { op_desc->SetType(op_type_); - op_desc->SetInput("X", {input_}); - op_desc->SetOutput("Out", {output_}); + op_desc->SetInput("X", {x_}); + op_desc->SetOutput("Out", {out_}); op_desc->SetAttr("axis", axis_); } void PrepareData() override { - std::vector din(dims_.production()); - fill_data_rand(din.data(), -1.f, 1.f, dims_.production()); - SetCommonTensor(input_, dims_, din.data()); + std::vector x(x_dims_.production()); + fill_data_rand(x.data(), -1.f, 1.f, x_dims_.production()); + SetCommonTensor(x_, x_dims_, x.data()); } }; @@ -100,18 +100,21 @@ TEST(Softmax, precision) { LOG(INFO) << "test softmax op"; float abs_error = 2e-5; Place place; -#if defined(LITE_WITH_XPU) +#if defined(LITE_WITH_NPU) + place = TARGET(kNPU); + abs_error = 4e-3; // Using fp16 in NPU +#elif defined(LITE_WITH_XPU) place = TARGET(kXPU); #else return; 
#endif - std::vector> dims{{1, 2, 3, 4}, {2, 3, 4}, {3, 4}}; - for (auto dim_in : dims) { + for (auto x_dims : + std::vector>{{1, 2, 3, 4}, {2, 3, 4}, {3, 4}}) { for (auto axis : {-1, 0, 1, 2, 3}) { - if (axis >= dim_in.size()) continue; + if (axis >= x_dims.size()) continue; std::unique_ptr tester( - new SoftmaxComputeTest(place, "def", DDim(dim_in), axis)); + new SoftmaxComputeTest(place, "def", DDim(x_dims), axis)); arena::Arena arena(std::move(tester), place, abs_error); arena.TestPrecision(); } diff --git a/lite/tests/kernels/transpose_compute_test.cc b/lite/tests/kernels/transpose_compute_test.cc old mode 100755 new mode 100644 index 62e0fc8e41..b4407bb569 --- a/lite/tests/kernels/transpose_compute_test.cc +++ b/lite/tests/kernels/transpose_compute_test.cc @@ -16,6 +16,7 @@ #include "lite/api/paddle_use_kernels.h" #include "lite/api/paddle_use_ops.h" #include "lite/core/arena/framework.h" +#include "lite/tests/utils/fill_data.h" namespace paddle { namespace lite { @@ -24,13 +25,13 @@ int data_index(std::vector pos, DDimLite dims) { int d1 = dims[1]; int d2 = dims[2]; int d3 = dims[3]; - return pos[3] + pos[2] * d3 + pos[1] * d3 * d2 + pos[0] * d3 * d2 * d1; + return pos[0] * d1 * d2 * d3 + pos[1] * d2 * d3 + pos[2] * d3 + pos[3]; } std::vector pos_trans(std::vector in_pos, std::vector axis) { std::vector out_pos(in_pos.size()); for (int i = 0; i < axis.size(); i++) { - out_pos[axis[i]] = in_pos[i]; + out_pos[i] = in_pos[axis[i]]; } return out_pos; } @@ -42,35 +43,34 @@ class TransposeComputeTester : public arena::TestCase { std::string input_ = "x"; std::string output_ = "out"; std::string xshape_ = "xshape"; - DDim x_dims_; + DDim dims_; std::vector axis_; public: TransposeComputeTester(const Place& place, const std::string& alias, - DDim x_dims, + DDim dims, std::vector axis) - : TestCase(place, alias), x_dims_(x_dims), axis_(axis) {} + : TestCase(place, alias), dims_(dims), axis_(axis) {} void RunBaseline(Scope* scope) override { auto* out = scope->NewTensor(output_); CHECK(out); auto* x = scope->FindTensor(input_); - auto x_dims = x->dims(); - std::vector out_shape(x_dims.size(), 0); - for (size_t i = 0; i < x_dims.size(); i++) { - out_shape[i] = x_dims[axis_[i]]; + std::vector out_shape(dims_.size(), 0); + for (size_t i = 0; i < dims_.size(); i++) { + out_shape[i] = dims_[axis_[i]]; } out->Resize(out_shape); auto y_dims = out->dims(); - int input_n = x_dims[0]; - int input_c = x_dims[1]; - int input_h = x_dims[2]; - int input_w = x_dims[3]; + int input_n = dims_[0]; + int input_c = dims_[1]; + int input_h = dims_[2]; + int input_w = dims_[3]; auto input_data = x->data(); auto output_data = out->mutable_data(); @@ -81,7 +81,7 @@ class TransposeComputeTester : public arena::TestCase { for (int w = 0; w < input_w; ++w) { std::vector in_pos{n, c, h, w}; std::vector out_pos = pos_trans(in_pos, axis_); - int in_index = data_index(in_pos, x_dims); + int in_index = data_index(in_pos, dims_); int out_index = data_index(out_pos, y_dims); output_data[out_index] = input_data[in_index]; } @@ -91,7 +91,7 @@ class TransposeComputeTester : public arena::TestCase { if (op_type_ == "transpose2") { auto* xshape = scope->NewTensor(xshape_); - auto xshape_dims = x_dims.Vectorize(); + auto xshape_dims = dims_.Vectorize(); xshape_dims.insert(xshape_dims.begin(), 0); xshape->Resize(xshape_dims); } @@ -108,11 +108,9 @@ class TransposeComputeTester : public arena::TestCase { } void PrepareData() override { - std::vector data(x_dims_.production()); - for (int i = 0; i < x_dims_.production(); i++) { - 
data[i] = i * 1.1; - } - SetCommonTensor(input_, x_dims_, data.data()); + std::vector din(dims_.production()); + fill_data_rand(din.data(), -1.f, 1.f, dims_.production()); + SetCommonTensor(input_, dims_, din.data()); } }; @@ -122,14 +120,16 @@ TEST(Transpose, precision) { Place place; #ifdef LITE_WITH_XPU place = TARGET(kXPU); +#elif defined(LITE_WITH_NPU) + place = TARGET(kNPU); + abs_error = 1e-2; // Using fp16 in NPU #else return; #endif DDim x_dims{{2, 3, 4, 5}}; - // [XPU]: {3, 1, 0, 2} is unsupported std::vector> axes{ - {0, 1, 2, 3}, {0, 1, 3, 2}, {0, 2, 1, 3}, {3, 1, 2, 0}}; + {0, 1, 2, 3}, {0, 1, 3, 2}, {0, 2, 1, 3}, {3, 1, 2, 0}, {3, 1, 0, 2}}; for (auto axis : axes) { std::unique_ptr tester( new TransposeComputeTester(place, "def", x_dims, axis)); diff --git a/lite/tests/kernels/unsqueeze_compute_test.cc b/lite/tests/kernels/unsqueeze_compute_test.cc index 590d3fd29c..d8ec2b01f7 100644 --- a/lite/tests/kernels/unsqueeze_compute_test.cc +++ b/lite/tests/kernels/unsqueeze_compute_test.cc @@ -223,67 +223,73 @@ class Unsqueeze2ComputeTester : public arena::TestCase { } }; -void test_unsqueeze(Place place) { +void test_unsqueeze(Place place, float abs_error = 2e-5) { for (std::vector axes : {std::vector({1}), std::vector({0, 2}), std::vector({0, -2})}) { - for (int N : {1}) { - for (int C : {3}) { - for (int H : {1}) { - for (int W : {5}) { - for (int input_axes_flag : {1, 2, 3}) { - LOG(INFO) << N << " " << C << " " << H << " " << W << " " - << input_axes_flag; - std::unique_ptr tester( - new UnsqueezeComputeTester( - place, "def", axes, DDim({N, C, H, W}), input_axes_flag)); - arena::Arena arena(std::move(tester), place, 2e-5); - arena.TestPrecision(); - } - } - } + for (auto dims : std::vector>{{3}, {3, 5}, {3, 5, 7}}) + for (int input_axes_flag : {1, 2, 3}) { +#ifdef LITE_WITH_NPU + if (input_axes_flag != 1) continue; + if (dims.size() + axes.size() > 4) continue; +#endif + std::unique_ptr tester(new UnsqueezeComputeTester( + place, "def", axes, DDim(dims), input_axes_flag)); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(); } - } } } -void test_unsqueeze2(Place place) { +void test_unsqueeze2(Place place, + float abs_error = 2e-5, + std::vector ignored_outs = {}) { for (std::vector axes : {std::vector({0}), std::vector({0, 2}), std::vector({0, -2})}) { - for (int N : {1}) { - for (int C : {3}) { - for (int H : {1}) { - for (int W : {5}) { - std::unique_ptr tester(new Unsqueeze2ComputeTester( - place, "def", axes, DDim({N, C, H, W}))); - arena::Arena arena(std::move(tester), place, 2e-5); - arena.TestPrecision(); - } - } - } + for (auto dims : + std::vector>{{3}, {3, 5}, {3, 5, 7}}) { +#ifdef LITE_WITH_NPU + if (dims.size() + axes.size() > 4) continue; +#endif + std::unique_ptr tester( + new Unsqueeze2ComputeTester(place, "def", axes, DDim(dims))); + arena::Arena arena(std::move(tester), place, abs_error); + arena.TestPrecision(ignored_outs); } } } TEST(squeeze, precision) { -#ifdef LITE_WITH_X86 - Place place(TARGET(kX86)); -#endif -#ifdef LITE_WITH_ARM - Place place(TARGET(kARM)); - test_unsqueeze(place); + Place place; + float abs_error = 2e-5; +#ifdef LITE_WITH_NPU + place = TARGET(kNPU); + abs_error = 1e-2; // Using fp16 in NPU +#elif defined(LITE_WITH_ARM) + place = TARGET(kARM); +#else + return; #endif + + test_unsqueeze(place, abs_error); } TEST(squeeze2, precision) { -#ifdef LITE_WITH_X86 - Place place(TARGET(kX86)); -#endif -#ifdef LITE_WITH_ARM - Place place(TARGET(kARM)); - test_unsqueeze2(place); + Place place; + float abs_error = 
2e-5; + std::vector ignored_outs = {}; +#ifdef LITE_WITH_NPU + place = TARGET(kNPU); + abs_error = 1e-2; // Using fp16 in NPU + ignored_outs.push_back("XShape"); // not supported out in NPU +#elif defined(LITE_WITH_ARM) + place = TARGET(kARM); +#else + return; #endif + + test_unsqueeze2(place, abs_error, ignored_outs); } } // namespace lite diff --git a/lite/tests/utils/timer.h b/lite/tests/utils/timer.h deleted file mode 100644 index 095f32046e..0000000000 --- a/lite/tests/utils/timer.h +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include // NOLINT -#include - -namespace paddle { -namespace lite { - -class Timer final { - public: - Timer() {} - - ~Timer() {} - - void clear() { ms_time_.clear(); } - - void start() { tstart_ = std::chrono::system_clock::now(); } - - void end() { - tend_ = std::chrono::system_clock::now(); - auto ts = - std::chrono::duration_cast(tend_ - tstart_); - latest_time_ = 1000.f * static_cast(ts.count()) * - std::chrono::microseconds::period::num / - std::chrono::microseconds::period::den; - ms_time_.push_back(latest_time_); - } - - float latest_time() const { return latest_time_; } - - float get_average_ms() { - if (ms_time_.size() == 0) { - return 0.f; - } - float sum = 0.f; - for (auto i : ms_time_) { - sum += i; - } - return sum / ms_time_.size(); - } - - float get_sum_ms() { - if (ms_time_.size() == 0) { - return 0.f; - } - float sum = 0.f; - for (auto i : ms_time_) { - sum += i; - } - return sum; - } - - // return tile (0-99) time. 
- float get_tile_time(float tile) { - if (tile < 0 || tile > 100) { - return -1.f; - } - int total_items = static_cast(ms_time_.size()); - if (total_items <= 0) { - return -2.f; - } - ms_time_.sort(); - int pos = static_cast(tile * total_items / 100); - auto it = ms_time_.begin(); - for (int i = 0; i < pos; ++i) { - ++it; - } - return *it; - } - - std::list get_time_stat() { return ms_time_; } - - float get_min_time() { - ms_time_.sort(); - return *ms_time_.begin(); - } - - float get_max_time() { - ms_time_.sort([](int a, int b) { return a > b; }); - return *ms_time_.begin(); - } - - private: - std::chrono::time_point tstart_; - std::chrono::time_point tend_; - std::list ms_time_; - float latest_time_; -}; - -} // namespace lite -} // namespace paddle diff --git a/lite/tools/build_bm.sh b/lite/tools/build_bm.sh deleted file mode 100755 index f4cfee5ec6..0000000000 --- a/lite/tools/build_bm.sh +++ /dev/null @@ -1,112 +0,0 @@ -#!/bin/bash -set -ex - -# global variables with default value -BM_SDK_ROOT="$(pwd)/../BM_SDK" # BM SDK -TARGET_NAME="BM1682" # default target -BUILD_EXTRA=OFF # ON(with sequence ops)/OFF -WITH_TESTING=ON # ON/OFF - -function print_usage { - echo -e "\nUSAGE:" - echo - echo "----------------------------------------" - echo -e "--bm_sdk_root=" - echo -e "--target_name=" - echo "----------------------------------------" - echo -} - -# readonly variables with default value -readonly CMAKE_COMMON_OPTIONS="-DWITH_LITE=ON \ - -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF \ - -DWITH_PYTHON=OFF \ - -DLITE_WITH_ARM=OFF" - -readonly NUM_CORES_FOR_COMPILE=${LITE_BUILD_THRLITE_BUILD_THREADSEADS:-1} - -readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz -readonly workspace=$(pwd) - -function prepare_thirdparty { - if [ ! -d $workspace/third-party -o -f $workspace/third-party-05b862.tar.gz ]; then - rm -rf $workspace/third-party - - if [ ! -f $workspace/third-party-05b862.tar.gz ]; then - wget $THIRDPARTY_TAR - fi - tar xzf third-party-05b862.tar.gz - else - git submodule update --init --recursive - fi -} - -# for code gen, a source file is generated after a test, but is dependended by some targets in cmake. -# here we fake an empty file to make cmake works. -function prepare_workspace { - # in build directory - # 1. Prepare gen_code file - GEN_CODE_PATH_PREFIX=lite/gen_code - mkdir -p ./${GEN_CODE_PATH_PREFIX} - touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc - - # 2.Prepare debug tool - DEBUG_TOOL_PATH_PREFIX=lite/tools/debug - mkdir -p ./${DEBUG_TOOL_PATH_PREFIX} - cp ../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py ./${DEBUG_TOOL_PATH_PREFIX}/ - - # clone submodule - # git submodule update --init --recursive - prepare_thirdparty -} - -function build_bm { - build_dir=${workspace}/build.lite.bm - mkdir -p $build_dir - cd $build_dir - - prepare_workspace - cmake .. \ - ${CMAKE_COMMON_OPTIONS} \ - -DWITH_GPU=OFF \ - -DWITH_MKLDNN=OFF \ - -DLITE_WITH_X86=ON \ - -DWITH_MKL=ON \ - -DLITE_BUILD_EXTRA=ON \ - -DLITE_WITH_XPU=OFF \ - -DLITE_WITH_BM=ON \ - -DWITH_TESTING=${WITH_TESTING} \ - -DBM_SDK_ROOT=${BM_SDK_ROOT} - - make -j$NUM_CORES_FOR_COMPILE - - cd - - echo "Done" -} - -function main { - # Parse command line. 
- for i in "$@"; do - case $i in - --target_name=*) - TARGET_NAME="${i#*=}" - shift - ;; - --bm_sdk_root=*) - BM_SDK_ROOT="${i#*=}" - shift - ;; - bm) - build_bm - shift - ;; - *) - # unknown option - print_usage - exit 1 - ;; - esac - done -} - -main $@ diff --git a/lite/tools/build_xpu.sh b/lite/tools/build_xpu.sh index 9f28274471..fdf287501e 100755 --- a/lite/tools/build_xpu.sh +++ b/lite/tools/build_xpu.sh @@ -104,6 +104,11 @@ function main { build_xpu shift ;; + full_publish) + TARGET_NAME=publish_inference + build_xpu + shift + ;; *) # unknown option print_usage diff --git a/lite/tools/ci_build.sh b/lite/tools/ci_build.sh index 91afc5039c..a0273efe13 100755 --- a/lite/tools/ci_build.sh +++ b/lite/tools/ci_build.sh @@ -610,6 +610,44 @@ function build_arm { } +# $1: ARM_TARGET_OS in "ios", "ios64" +# $2: ARM_TARGET_ARCH_ABI in "armv7", "armv8" +function build_ios { + local os=$1 + local abi=$2 + build_dir=build.ios.${os}.${abi} + echo "building ios target into $build_dir" + echo "target os: $os" + echo "target abi: $abi" + mkdir -p ${build_dir} + cd ${build_dir} + GEN_CODE_PATH_PREFIX=lite/gen_code + mkdir -p ./${GEN_CODE_PATH_PREFIX} + touch ./${GEN_CODE_PATH_PREFIX}/__generated_code__.cc + + cmake .. \ + -DWITH_GPU=OFF \ + -DWITH_MKL=OFF \ + -DWITH_LITE=ON \ + -DLITE_WITH_CUDA=OFF \ + -DLITE_WITH_X86=OFF \ + -DLITE_WITH_ARM=ON \ + -DWITH_TESTING=OFF \ + -DLITE_WITH_JAVA=OFF \ + -DLITE_SHUTDOWN_LOG=ON \ + -DLITE_ON_TINY_PUBLISH=ON \ + -DLITE_WITH_OPENMP=OFF \ + -DWITH_ARM_DOTPROD=OFF \ + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=ON \ + -DARM_TARGET_ARCH_ABI=$abi \ + -DLITE_BUILD_EXTRA=$BUILD_EXTRA \ + -DLITE_WITH_CV=$BUILD_CV \ + -DARM_TARGET_OS=$os + + make -j4 publish_inference + cd - +} + # $1: ARM_TARGET_OS in "android" # $2: ARM_TARGET_ARCH_ABI in "armv8", "armv7" # $3: ARM_TARGET_LANG in "gcc" "clang" @@ -773,6 +811,21 @@ function build_test_arm_subtask_armlinux { echo "Done" } +# sub-task3 +# this task will test IOS compiling, which requires cmake_version>=3.15 +function build_test_arm_subtask_ios { + cur=$PWD + # job 8 + build_ios "ios" "armv7" + cd $cur + + # job 9 + build_ios "ios64" "armv8" + cd $cur + + echo "Done" +} + # this method need to invoke `build_test_arm_subtask_android` first. function build_test_arm_subtask_model { # We just test following single one environment to limit the CI time. @@ -1042,6 +1095,10 @@ function main { build_test_arm_subtask_armlinux shift ;; + build_test_arm_subtask_ios) + build_test_arm_subtask_ios + shift + ;; check_style) check_style shift diff --git a/lite/tools/cmake_tools/create_fake_kernel_registry.py b/lite/tools/cmake_tools/create_fake_kernel_registry.py index 140d773207..35012d5b16 100644 --- a/lite/tools/cmake_tools/create_fake_kernel_registry.py +++ b/lite/tools/cmake_tools/create_fake_kernel_registry.py @@ -18,6 +18,9 @@ import logging from ast import RegisterLiteKernelParser from utils import * +if len(sys.argv) != 4: + print("Error: create_fake_kernel_registry.py requires three inputs!") + exit(1) ops_list_path = sys.argv[1] dest_path = sys.argv[2] kernelmap_path = sys.argv[3] diff --git a/lite/tools/cmake_tools/parse_kernel_registry.py b/lite/tools/cmake_tools/parse_kernel_registry.py index f4f0b95483..6c020ec438 100644 --- a/lite/tools/cmake_tools/parse_kernel_registry.py +++ b/lite/tools/cmake_tools/parse_kernel_registry.py @@ -12,10 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import print_function import sys import logging from ast import RegisterLiteKernelParser +if len(sys.argv) != 5: + print("Error: parse_kernel_registry.py requires four inputs!") + exit(1) ops_list_path = sys.argv[1] dest_path = sys.argv[2] minkernels_list_path = sys.argv[3] diff --git a/lite/tools/cmake_tools/parse_op_registry.py b/lite/tools/cmake_tools/parse_op_registry.py index db58c455a9..7eb3337ed8 100644 --- a/lite/tools/cmake_tools/parse_op_registry.py +++ b/lite/tools/cmake_tools/parse_op_registry.py @@ -13,10 +13,14 @@ # limitations under the License. ''' Collect op registry information. ''' +from __future__ import print_function import sys import logging from ast import RegisterLiteOpParser +if len(sys.argv) != 5: + print("Error: parse_op_registry.py requires four inputs!") + exit(1) ops_list_path = sys.argv[1] dest_path = sys.argv[2] minops_list_path = sys.argv[3] diff --git a/lite/tools/cmake_tools/record_supported_kernel_op.py b/lite/tools/cmake_tools/record_supported_kernel_op.py new file mode 100644 index 0000000000..f6a3af6bd3 --- /dev/null +++ b/lite/tools/cmake_tools/record_supported_kernel_op.py @@ -0,0 +1,129 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import sys +import logging +from ast import RegisterLiteKernelParser +from ast import RegisterLiteOpParser + +if len(sys.argv) != 4: + print("Error: record_supported_kernel_op.py requires three inputs!") + exit(1) +kernels_list_path = sys.argv[1] +ops_list_path = sys.argv[2] +kernel_op_map_dest_path = sys.argv[3] + + +out_lines = [ +''' +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include + +const std::vector> supported_ops_target = { +''' +] + +ops_lines=[] + +# valid targets and valid_ops +valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU"] +valid_ops = [[],[],[],[],[],[],[],[],[],[]] +class TargetType: + kUnk = 0 + kHost = 1 + kX86 = 2 + kCUDA = 3 + kARM = 4 + kOpenCL = 5 + kFPGA = 7 + kNPU = 8 + kXPU = 9 + kAny = 6 # any target + +# record op_info of valid kernels into `valid_ops` according to different target type +with open(kernels_list_path) as f: + paths = set([path for path in f]) + for path in paths: + with open(path.strip()) as g: + c = g.read() + kernel_parser = RegisterLiteKernelParser(c) + kernel_parser.parse() + for k in kernel_parser.kernels: + if hasattr(TargetType, k.target): + index=getattr(TargetType, k.target) + valid_ops[index].append(k.op_type) + +# clear the repeated ops +for target in valid_targets: + index = getattr(TargetType, target) + valid_ops[index] = list(set(valid_ops[index])) + +paths = set() +with open(ops_list_path) as f: + paths = set([path for path in f]) + for path in paths: + str_info = open(path.strip()).read() + op_parser = RegisterLiteOpParser(str_info) + ops = op_parser.parse() + for op in ops: + if "_grad" in op: + continue + out = ' {"%s", { "' % op + op_targets = [] + for target in valid_targets: + if op in valid_ops[getattr(TargetType, target)]: + op_targets.append(target) + if len(op_targets) > 0: + out = out +'", "'.join(op_targets)+ '" }}' + else: + # unknow type op: kUnk = 0 + valid_ops[0].append(op) + out = out +'kUnk" }}' + ops_lines.append(out) + +with open(kernel_op_map_dest_path, 'w') as f: + logging.info("write kernel list to %s" % kernel_op_map_dest_path) + f.write('\n'.join(out_lines)) + # write kernels into head file + for target in valid_targets: + if len(valid_ops[getattr(TargetType, target)]) == 0 : + f.write("\n // %s_OPS: " %target) + f.write('\n {},') + else: + f.write("\n // %s_OPS: " %target) + f.write('\n {"') + f.write('","'.join(valid_ops[getattr(TargetType, target)])) + f.write('"},\n') + f.write('};') + # write op info into head file + f.write('\nconst std::map> supported_ops={\n') + f.write(',\n'.join(ops_lines)) + f.write('\n};') diff --git a/lite/utils/cv/CMakeLists.txt b/lite/utils/cv/CMakeLists.txt index 0edcb2ef24..6c88e70de1 100644 --- a/lite/utils/cv/CMakeLists.txt +++ b/lite/utils/cv/CMakeLists.txt @@ -1,5 +1,4 @@ if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ARM) - set(lite_cv_deps) lite_cc_library(paddle_cv_arm SRCS image_convert.cc paddle_image_preprocess.cc @@ -7,5 +6,5 @@ if(LITE_WITH_CV AND (NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND LITE_WITH_ image_flip.cc image_rotate.cc image_resize.cc - DEPS ${lite_cv_deps} paddle_api place) + DEPS paddle_api place) endif() diff --git a/lite/utils/cv/image2tensor.cc b/lite/utils/cv/image2tensor.cc index b51a82da1d..3a09039a0f 100644 --- a/lite/utils/cv/image2tensor.cc +++ b/lite/utils/cv/image2tensor.cc @@ -18,6 +18,13 @@ namespace paddle { namespace lite { namespace utils { namespace cv { +void gray_to_tensor(const uint8_t* src, + float* output, + int width, + int height, + float* means, + float* scales); + void bgr_to_tensor_chw(const uint8_t* src, float* output, int width, @@ -52,7 +59,7 @@ void bgra_to_tensor_hwc(const uint8_t* src, * NCHW * param src: input image data * param dstTensor: output tensor data - * param srcFormat: input image format, support BGR(GRB) and BGRA(RGBA) + * param srcFormat: input image 
format, support GRAY, BGR(GRB) and BGRA(RGBA) * param srcw: input image width * param srch: input image height * param layout: output tensor layout,support NHWC and NCHW @@ -79,6 +86,9 @@ void Image2Tensor::choose(const uint8_t* src, } else if (layout == LayoutType::kNHWC && (srcFormat == BGRA || srcFormat == RGBA)) { impl_ = bgra_to_tensor_hwc; + } else if ((layout == LayoutType::kNHWC || layout == LayoutType::kNCHW) && + (srcFormat == GRAY)) { + impl_ = gray_to_tensor; } else { printf("this layout: %d or image format: %d not support \n", static_cast(layout), @@ -87,6 +97,147 @@ void Image2Tensor::choose(const uint8_t* src, } impl_(src, output, srcw, srch, means, scales); } + +void gray_to_tensor(const uint8_t* src, + float* output, + int width, + int height, + float* means, + float* scales) { + int size = width * height; + float mean_val = means[0]; + float scale_val = scales[0]; + + int dim16 = width >> 16; + int remain = width % 16; + + float32x4_t vmean = vdupq_n_f32(mean_val); + float32x4_t vscale = vdupq_n_f32(scale_val); +#pragma omp parallel for + for (int i = 0; i < height; i += 1) { + const uint8_t* din_ptr = src + i * width; + float* ptr_h = output + i * width; + int cnt = dim16; + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr0], #64] \n" + "prfm pldl1keep, [%[inptr0], #128] \n" + "prfm pldl1keep, [%[inptr0], #192] \n" + "1: \n" + "ld1 {v0.8b}, [%[inptr0]], #8 \n" // d8 = y0y1y2.." + "ld1 {v1.8b}, [%[inptr0]], #8 \n" // d8 = y0y1y2.." + // 8->16 + "ushll v3.8h, v0.8b, #0 \n" + "ushll v4.8h, v0.8b, #0 \n" + // 16->32 + "ushll v6.4s, v3.4h, #0 \n" + "ushll2 v7.4s, v3.8h, #0 \n" + "ushll v8.4s, v4.4h, #0 \n" + "ushll2 v9.4s, v4.8h, #0 \n" + // int32->fp32 + "ucvtf v12.4s, v6.4s \n" + "ucvtf v13.4s, v7.4s \n" + "ucvtf v14.4s, v8.4s \n" + "ucvtf v15.4s, v9.4s \n" + // sub -mean + "fsub v12.4s, v12.4s, %w[vmean].4s \n" + "fsub v13.4s, v13.4s, %w[vmean].4s \n" + "fsub v14.4s, v14.4s, %w[vmean].4s \n" + "fsub v15.4s, v15.4s, %w[vmean].4s \n" + // mul * scale + "fmul v6.4s, v12.4s, %w[vscale].4s \n" + "fmul v7.4s, v13.4s, %w[vscale].4s \n" + "fmul v8.4s, v14.4s, %w[vscale].4s \n" + "fmul v9.4s, v15.4s, %w[vscale].4s \n" + // store + "st1 {v6.4s}, [%[outr0]], #16 \n" + "subs %w[cnt], %w[cnt], #1 \n" + "st1 {v7.4s}, [%[outr0]], #16 \n" + "st1 {v8.4s}, [%[outr0]], #16 \n" + "st1 {v9.4s}, [%[outr0]], #16 \n" + "bne 1b \n" + : [inptr0] "+r"(din_ptr), [outr0] "+r"(ptr_h), [cnt] "+r"(cnt) + : [vmean] "w"(vmean), [vscale] "w"(vscale) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15"); +#else + asm volatile( + "pld [%[inptr0]] @ preload a, 64byte\n" + "pld [%[inptr0], #64] @ preload a, 64byte\n" + "pld [%[inptr0], #128] @ preload a, 64byte\n" + "pld [%[inptr0], #192] @ preload a, 64byte\n" + "1: \n" + "vld1.8 {d12, d13}, [%[inptr0]]! 
\n" + // 8->16 + "vmovl.u8 q8, d12 \n" + "vmovl.u8 q9, d13 \n" + // 16->32 + "vmovl.u16 q11, d16 \n" + "vmovl.u16 q12, d17 \n" + "vmovl.u16 q13, d18 \n" + "vmovl.u16 q14, d19 \n" + // int32->fp32 + "vcvt.f32.u32 q7, q11 \n" + "vcvt.f32.u32 q8, q12 \n" + "vcvt.f32.u32 q9, q13 \n" + "vcvt.f32.u32 q10, q14 \n" + // sub -mean + "vsub.f32 q7, q7, %q[vmean] \n" + "vsub.f32 q8, q8, %q[vmean] \n" + "vsub.f32 q9, q9, %q[vmean] \n" + "vsub.f32 q10, q10, %q[vmean] \n" + // mul *scale + "vmul.f32 q11, q7, %q[vscale] \n" + "vmul.f32 q12, q8, %q[vscale] \n" + "vmul.f32 q13, q9, %q[vscale] \n" + "vmul.f32 q14, q10, %q[vscale] \n" + // store + "vst1.32 {d22 - d23}, [%[outr0]]! \n" + "subs %[cnt], #1 \n" + "vst1.32 {d24 - d25}, [%[outr0]]! \n" + "vst1.32 {d26 - d27}, [%[outr0]]! \n" + "vst1.32 {d28 - d29}, [%[outr0]]! \n" + "bne 1b" + : [inptr0] "+r"(din_ptr), [outr0] "+r"(ptr_h), [cnt] "+r"(cnt) + : [vmean] "w"(vmean), [vscale] "w"(vscale) + : "cc", + "memory", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14"); +#endif + } + for (int j = 0; j < remain; j++) { + *ptr_h++ = (*din_ptr - mean_val) * scale_val; + din_ptr++; + } + } +} + void bgr_to_tensor_chw(const uint8_t* src, float* output, int width, @@ -390,6 +541,7 @@ void bgra_to_tensor_chw(const uint8_t* src, } } } + void bgr_to_tensor_hwc(const uint8_t* src, float* output, int width, diff --git a/lite/utils/cv/image_convert.cc b/lite/utils/cv/image_convert.cc index 24b6db70dd..385f56d233 100644 --- a/lite/utils/cv/image_convert.cc +++ b/lite/utils/cv/image_convert.cc @@ -30,10 +30,14 @@ void nv21_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch); void nv21_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch); void nv12_to_bgr(const uint8_t* src, uint8_t* dst, int srcw, int srch); void nv12_to_bgra(const uint8_t* src, uint8_t* dst, int srcw, int srch); +// bgra rgba to gray +void hwc4_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch); // bgr rgb to gray void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch); // gray to bgr rgb void hwc1_to_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch); +// gray to bgra rgba +void hwc1_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch); // bgr to bgra or rgb to rgba void hwc3_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch); // bgra to bgr or rgba to rgb @@ -112,6 +116,12 @@ void ImageConvert::choose(const uint8_t* src, } else if ((srcFormat == RGB && dstFormat == BGRA) || (srcFormat == BGR && dstFormat == RGBA)) { impl_ = hwc3_trans_hwc4; + } else if ((srcFormat == GRAY && dstFormat == RGBA) || + (srcFormat == GRAY && dstFormat == BGRA)) { + impl_ = hwc1_to_hwc4; + } else if ((srcFormat == RGBA && dstFormat == GRAY) || + (srcFormat == BGRA && dstFormat == GRAY)) { + impl_ = hwc4_to_hwc1; } else { printf("srcFormat: %d, dstFormat: %d does not support! 
\n", srcFormat, @@ -989,7 +999,7 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) { "vshrn.u32 d24, q6, #7 \n" "vshrn.u32 d25, q7, #7 \n" "vshrn.u32 d26, q8, #7 \n" - "vshrn.u32 d27, q8, #7 \n" + "vshrn.u32 d27, q9, #7 \n" // 16->8 "vmovn.u16 d4, q10 \n" "vmovn.u16 d5, q11 \n" @@ -1077,6 +1087,280 @@ void hwc3_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) { } } /* +采用CV_BGR2GRAY,转换公式Gray = 0.1140*B + 0.5870*G + 0.2989*R +采用CV_RGB2GRAY,转换公式Gray = 0.1140*R + 0.5870*G + 0.2989*B +b = 0.114 *128 = 14.529 = 15 +g = 0.587 * 128 = 75.136 = 75 +r = 0.2989 * 127 = 38.2592 = 38 +Gray = (15*B + 75*G + 38*R)/128 +bgra2gray, rgba2gray +*/ +void hwc4_to_hwc1(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + uint8_t b = 15; + uint8_t g = 75; + uint8_t r = 38; + + uint8x8_t vb = vdup_n_u8(b); + uint8x8_t vg = vdup_n_u8(g); + uint8x8_t vr = vdup_n_u8(r); +#ifdef __aarch64__ +#else + uint8_t vb_array[8] = {b, b, b, b, b, b, b, b}; + uint8_t vg_array[8] = {g, g, g, g, g, g, g, g}; + uint8_t vr_array[8] = {r, r, r, r, r, r, r, r}; +#endif + int cnt_pro = srcw >> 3; + int remain_pro = srcw % 8; + int win = srcw * 4; + int i = 0; +#pragma omp parallel for + for (i = 0; i < srch - 3; i += 4) { + int j = 0; + const uint8_t* inptr0 = src + i * win; + const uint8_t* inptr1 = inptr0 + win; + const uint8_t* inptr2 = inptr1 + win; + const uint8_t* inptr3 = inptr2 + win; + uint8_t* outr0 = dst + i * srcw; + uint8_t* outr1 = outr0 + srcw; + uint8_t* outr2 = outr1 + srcw; + uint8_t* outr3 = outr2 + srcw; + + int cnt = cnt_pro; + if (cnt > 0) { +#ifdef __aarch64__ + asm volatile( + "prfm pldl1keep, [%[inptr0]] \n" + "prfm pldl1keep, [%[inptr0], #128] \n" + "prfm pldl1keep, [%[inptr1]] \n" + "prfm pldl1keep, [%[inptr1], #128] \n" + "prfm pldl1keep, [%[inptr2]] \n" + "prfm pldl1keep, [%[inptr2], #128] \n" + "prfm pldl1keep, [%[inptr3]] \n" + "prfm pldl1keep, [%[inptr3], #128] \n" + "1: \n" + "ld4 {v0.8b - v3.8b}, [%[inptr0]], #32 \n" // d8 = y0y3y6y9.. d9 = + // y1y4y7... + "ld4 {v4.8b - v7.8b}, [%[inptr1]], #32 \n" // d8 = y0y3y6y9.. d9 = + // y1y4y7... + "ld4 {v8.8b - v11.8b}, [%[inptr2]], #32 \n" // d8 = y0y3y6y9.. d9 = + // y1y4y7... + "ld4 {v12.8b - v15.8b}, [%[inptr3]], #32 \n" // d8 = y0y3y6y9.. d9 = + // y1y4y7... 
+ // mul b + "umull v13.8h, v0.8b, %w[vb].8b \n" // v0 * vb + "umull v14.8h, v4.8b, %w[vb].8b \n" // v0 * vb + "umull v15.8h, v8.8b, %w[vb].8b \n" // v0 * vb + "umull v16.8h, v12.8b, %w[vb].8b \n" // v0 * vb + // mul g + "umull v17.8h, v1.8b, %w[vg].8b \n" // v0 * vb + "umull v18.8h, v5.8b, %w[vg].8b \n" // v0 * vb + "umull v19.8h, v9.8b, %w[vg].8b \n" // v0 * vb + "umull v20.8h, v13.8b, %w[vg].8b \n" // v0 * vb + // mul r + "umlal v13.8h, v2.8b, %w[vr].8b \n" // v0 * vb + "umlal v14.8h, v6.8b, %w[vr].8b \n" // v0 * vb + "umlal v15.8h, v10.8b, %w[vr].8b \n" // v0 * vb + "umlal v16.8h, v14.8b, %w[vr].8b \n" // v0 * vb + // 16->32 + "uaddl v0.4s, v17.4h, v13.4h \n" + "uaddl2 v1.4s, v17.8h, v13.8h \n" + "uaddl v2.4s, v18.4h, v14.4h \n" + "uaddl2 v3.4s, v18.8h, v14.8h \n" + "uaddl v4.4s, v19.4h, v15.4h \n" + "uaddl2 v5.4s, v19.8h, v15.8h \n" + "uaddl v6.4s, v20.4h, v16.4h \n" + "uaddl2 v7.4s, v20.8h, v16.8h \n" + // 32->16 v0 >> 7 + "shrn v12.4h, v0.4s, #7 \n" + "shrn2 v12.8h, v1.4s, #7 \n" + "shrn v13.4h, v2.4s, #7 \n" + "shrn2 v13.8h, v3.4s, #7 \n" + "shrn v14.4h, v4.4s, #7 \n" + "shrn2 v14.8h, v5.4s, #7 \n" + "shrn v15.4h, v6.4s, #7 \n" + "shrn2 v15.8h, v7.4s, #7 \n" + // 16->8 + "xtn v0.8b, v12.8h \n" + "xtn v1.8b, v13.8h \n" + "xtn v2.8b, v14.8h \n" + "xtn v3.8b, v15.8h \n" + "subs %w[cnt], %w[cnt], #1 \n" + "st1 {v0.8b}, [%[outr0]], #8 \n" + "st1 {v1.8b}, [%[outr1]], #8 \n" + "st1 {v2.8b}, [%[outr2]], #8 \n" + "st1 {v3.8b}, [%[outr3]], #8 \n" + "bne 1b \n" + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outr0] "+r"(outr0), + [outr1] "+r"(outr1), + [outr2] "+r"(outr2), + [outr3] "+r"(outr3), + [cnt] "+r"(cnt) + : [vb] "w"(vb), [vg] "w"(vg), [vr] "w"(vr) + : "cc", + "memory", + "v0", + "v1", + "v2", + "v3", + "v4", + "v5", + "v6", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v13", + "v14", + "v15", + "v16", + "v17", + "v18", + "v19", + "v20"); +#else + asm volatile( + "pld [%[inptr0]] @ preload a, 64byte\n" + "pld [%[inptr0], #128] @ preload a, 64byte\n" + "pld [%[inptr1]] @ preload a, 64byte\n" + "pld [%[inptr1], #128] @ preload a, 64byte\n" + "pld [%[inptr2]] @ preload a, 64byte\n" + "pld [%[inptr2], #128] @ preload a, 64byte\n" + "pld [%[inptr3]] @ preload a, 64byte\n" + "pld [%[inptr3], #128] @ preload a, 64byte\n" + "vld1.8 d0, [%[vb]] \n" + "vld1.8 d1, [%[vg]] \n" + "vld1.8 d2, [%[vr]] \n" + "1: \n" + "vld4.8 {d3, d4, d5, d6}, [%[inptr0]]! \n" + "vld4.8 {d7, d8, d9, d10}, [%[inptr1]]! \n" + "vld4.8 {d11, d12, d13, d14}, [%[inptr2]]! \n" + "vld4.8 {d15, d16, d17, d18}, [%[inptr3]]! 
\n" + // vb + "vmull.u8 q10, d3, d0 \n" + "vmull.u8 q11, d7, d0 \n" + "vmull.u8 q12, d11, d0 \n" + "vmull.u8 q13, d15, d0 \n" + // vg + "vmull.u8 q14, d4, d1 \n" + "vmull.u8 q15, d8, d1 \n" + "vmull.u8 q5, d12, d1 \n" + "vmull.u8 q7, d16, d1 \n" + // vr + "vmlal.u8 q10, d5, d2 \n" + "vmlal.u8 q11, d9, d2 \n" + "vmlal.u8 q12, d13, d2 \n" + "vmlal.u8 q13, d17, d2 \n" + // 16->32 + "vaddl.u16 q2, d28, d20 \n" + "vaddl.u16 q3, d29, d21 \n" + "vaddl.u16 q4, d30, d22 \n" + "vaddl.u16 q10, d31, d23 \n" + "vaddl.u16 q6, d10, d24 \n" + "vaddl.u16 q11, d11, d25 \n" + "vaddl.u16 q8, d14, d26 \n" + "vaddl.u16 q9, d15, d27 \n" + // 32->16 q2 >> 7 + "vshrn.u32 d10, q2, #7 \n" + "vshrn.u32 d11, q3, #7 \n" + "vshrn.u32 d14, q4, #7 \n" + "vshrn.u32 d15, q10, #7 \n" + "vshrn.u32 d24, q6, #7 \n" + "vshrn.u32 d25, q11, #7 \n" + "vshrn.u32 d26, q8, #7 \n" + "vshrn.u32 d27, q9, #7 \n" + // 16->8 + "vmovn.u16 d4, q5 \n" + "vmovn.u16 d5, q7 \n" + "vmovn.u16 d6, q12 \n" + "vmovn.u16 d7, q13 \n" + "subs %[cnt], #1 \n" + // store + "vst1.8 d4, [%[outr0]]! \n" + "vst1.8 d5, [%[outr1]]! \n" + "vst1.8 d6, [%[outr2]]! \n" + "vst1.8 d7, [%[outr3]]! \n" + "bne 1b \n" + : [inptr0] "+r"(inptr0), + [inptr1] "+r"(inptr1), + [inptr2] "+r"(inptr2), + [inptr3] "+r"(inptr3), + [outr0] "+r"(outr0), + [outr1] "+r"(outr1), + [outr2] "+r"(outr2), + [outr3] "+r"(outr3), + [cnt] "+r"(cnt) + : [vb] "r"(vb_array), [vg] "r"(vg_array), [vr] "r"(vr_array) + : "cc", + "memory", + "q0", + "q1", + "q2", + "q3", + "q4", + "q5", + "q6", + "q7", + "q8", + "q9", + "q10", + "q11", + "q12", + "q13", + "q14", + "q15"); +#endif + } + for (; j < remain_pro; j++) { + *outr0++ = (inptr0[0] * b + inptr0[1] * g + inptr0[2] * r) >> 7; + *outr1++ = (inptr1[0] * b + inptr1[1] * g + inptr1[2] * r) >> 7; + *outr2++ = (inptr2[0] * b + inptr2[1] * g + inptr2[2] * r) >> 7; + *outr3++ = (inptr3[0] * b + inptr3[1] * g + inptr3[2] * r) >> 7; + inptr0 += 4; + inptr1 += 4; + inptr2 += 4; + inptr3 += 4; + } + } + for (; i < srch; i++) { + int j = 0; + const uint8_t* inptr0 = src + i * win; + uint8_t* outr0 = dst + i * srcw; + for (j = 0; j < cnt_pro; j++) { + uint8x8x4_t y0 = vld4_u8(inptr0); // d8 = y0y3y6y9.. 
d9 = y1y4y7...y + uint16x8_t val0 = vmull_u8(y0.val[0], vb); + + uint16x8_t val0_1 = vmull_u8(y0.val[1], vg); + + val0 = vmlal_u8(val0, y0.val[2], vr); + + uint32x4_t v0_sum0 = vaddl_u16(vget_low_u16(val0_1), vget_low_u16(val0)); + uint32x4_t v0_sum1 = + vaddl_u16(vget_high_u16(val0_1), vget_high_u16(val0)); + + uint16x4_t v0_sum0_16 = vshrn_n_u32(v0_sum0, 7); + uint16x4_t v0_sum1_16 = vshrn_n_u32(v0_sum1, 7); + + uint16x8_t v0_sum = vcombine_u16(v0_sum0_16, v0_sum1_16); + + uint8x8_t vout0 = vmovn_u16(v0_sum); + + inptr0 += 32; + vst1_u8(outr0, vout0); + outr0 += 8; + } + for (; j < srcw; j++) { + *outr0++ = (inptr0[0] * b + inptr0[1] * g + inptr0[2] * r) >> 7; + inptr0 += 4; + } + } +} +/* 采用CV_GRAY2BGR,转换公式B = G = R = Gray 采用CV_GRAY2RGB,转换公式R = G = B = Gray gray2bgr, gray2rgb @@ -1091,6 +1375,22 @@ void hwc1_to_hwc3(const uint8_t* src, uint8_t* dst, int srcw, int srch) { } } } +/* +采用CV_GRAY2BGRA,转换公式B = G = R = Gray A=255 +采用CV_GRAY2RGBA,转换公式R = G = B = Gray A=255 +gray2bgra, gray2rgba +*/ +void hwc1_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch) { + for (int i = 0; i < srch; i++) { + for (int j = 0; j < srcw; j++) { + *dst++ = *src; + *dst++ = *src; + *dst++ = *src; + *dst++ = 255; + src++; + } + } +} // bgr2bgra, rgb2rgba void hwc3_to_hwc4(const uint8_t* src, uint8_t* dst, int srcw, int srch) { for (int i = 0; i < srch; i++) { diff --git a/lite/utils/cv/image_flip.cc b/lite/utils/cv/image_flip.cc index fd84691a2d..f535c858e4 100644 --- a/lite/utils/cv/image_flip.cc +++ b/lite/utils/cv/image_flip.cc @@ -19,6 +19,23 @@ namespace paddle { namespace lite { namespace utils { namespace cv { +void ImageFlip::choose(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + FlipParam flip_param) { + if (srcFormat == GRAY) { + flip_hwc1(src, dst, srcw, srch, flip_param); + } else if (srcFormat == BGR || srcFormat == RGB) { + flip_hwc3(src, dst, srcw, srch, flip_param); + } else if (srcFormat == BGRA || srcFormat == RGBA) { + flip_hwc4(src, dst, srcw, srch, flip_param); + } else { + printf("this srcFormat: %d does not support! 
\n", srcFormat); + return; + } +} // gray void flip_hwc1_x(const uint8_t* src, uint8_t* dst, int w_in, int h_in); void flip_hwc1_y(const uint8_t* src, uint8_t* dst, int w_in, int h_in); @@ -43,6 +60,9 @@ void flip_hwc1(const uint8_t* src, flip_hwc1_y(src, dst, srcw, srch); } else if (flip_param == XY) { flip_hwc1_xy(src, dst, srcw, srch); + } else { + printf("its doesn't support Flip: %d \n", static_cast(flip_param)); + return; } } @@ -57,6 +77,9 @@ void flip_hwc3(const uint8_t* src, flip_hwc3_y(src, dst, srcw, srch); } else if (flip_param == XY) { flip_hwc3_xy(src, dst, srcw, srch); + } else { + printf("its doesn't support Flip: %d \n", static_cast(flip_param)); + return; } } @@ -71,6 +94,9 @@ void flip_hwc4(const uint8_t* src, flip_hwc4_y(src, dst, srcw, srch); } else if (flip_param == XY) { flip_hwc4_xy(src, dst, srcw, srch); + } else { + printf("its doesn't support Flip: %d \n", static_cast(flip_param)); + return; } } /* diff --git a/lite/utils/cv/image_flip.h b/lite/utils/cv/image_flip.h index 5e513324a1..7215b9494a 100644 --- a/lite/utils/cv/image_flip.h +++ b/lite/utils/cv/image_flip.h @@ -21,6 +21,15 @@ namespace paddle { namespace lite { namespace utils { namespace cv { +class ImageFlip { + public: + void choose(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + FlipParam flip_param); +}; void flip_hwc1( const uint8_t* src, uint8_t* dst, int srcw, int srch, FlipParam flip_param); void flip_hwc3( diff --git a/lite/utils/cv/image_resize.cc b/lite/utils/cv/image_resize.cc index 8b0b8aa17d..cd02a2cf4b 100644 --- a/lite/utils/cv/image_resize.cc +++ b/lite/utils/cv/image_resize.cc @@ -38,6 +38,15 @@ namespace paddle { namespace lite { namespace utils { namespace cv { +void ImageResize::choose(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + int dstw, + int dsth) { + resize(src, dst, srcFormat, srcw, srch, dstw, dsth); +} void compute_xy(int srcw, int srch, int dstw, diff --git a/lite/utils/cv/image_resize.h b/lite/utils/cv/image_resize.h index e2e399f542..f11f7b5d93 100644 --- a/lite/utils/cv/image_resize.h +++ b/lite/utils/cv/image_resize.h @@ -39,6 +39,16 @@ namespace paddle { namespace lite { namespace utils { namespace cv { +class ImageResize { + public: + void choose(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + int dstw, + int dsth); +}; void resize(const uint8_t* src, uint8_t* dst, ImageFormat srcFormat, diff --git a/lite/utils/cv/image_rotate.cc b/lite/utils/cv/image_rotate.cc index 04ba840766..98e61fb444 100644 --- a/lite/utils/cv/image_rotate.cc +++ b/lite/utils/cv/image_rotate.cc @@ -19,6 +19,26 @@ namespace paddle { namespace lite { namespace utils { namespace cv { +void ImageRotate::choose(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + float degree) { + if (degree != 90 && degree != 180 && degree != 270) { + printf("this degree: %f not support \n", degree); + } + if (srcFormat == GRAY) { + rotate_hwc1(src, dst, srcw, srch, degree); + } else if (srcFormat == BGR || srcFormat == RGB) { + rotate_hwc3(src, dst, srcw, srch, degree); + } else if (srcFormat == BGRA || srcFormat == RGBA) { + rotate_hwc4(src, dst, srcw, srch, degree); + } else { + printf("this srcFormat: %d does not support! 
\n", srcFormat); + return; + } +} // gray void rotate_hwc1_90( const uint8_t* src, uint8_t* dst, int w_in, int h_in, int w_out, int h_out); @@ -50,6 +70,9 @@ void rotate_hwc1( rotate_hwc1_180(src, dst, srcw, srch, srcw, srch); } else if (degree == 270) { rotate_hwc1_270(src, dst, srcw, srch, srch, srcw); + } else { + printf("this degree: %f does not support! \n", degree); + return; } } @@ -61,6 +84,9 @@ void rotate_hwc3( rotate_hwc3_180(src, dst, srcw, srch, srcw, srch); } else if (degree == 270) { rotate_hwc3_270(src, dst, srcw, srch, srch, srcw); + } else { + printf("this degree: %f does not support! \n", degree); + return; } } @@ -72,6 +98,9 @@ void rotate_hwc4( rotate_hwc4_180(src, dst, srcw, srch, srcw, srch); } else if (degree == 270) { rotate_hwc4_270(src, dst, srcw, srch, srch, srcw); + } else { + printf("this degree: %f does not support! \n", degree); + return; } } #ifdef __aarch64__ @@ -578,6 +607,7 @@ void rotate_hwc1_90(const uint8_t* src, int stride_h = 4 * w_in; int stride_h_w = 4 * w_in - 8; int stride_out = 4 * w_out; + int ww = w_out - 8; #pragma omp parallel for for (i = 0; i < h_in - 7; i += 8) { const uint8_t* inptr0 = src + i * w_in; @@ -586,7 +616,7 @@ void rotate_hwc1_90(const uint8_t* src, const uint8_t* inptr3 = inptr2 + w_in; int j = 0; for (; j < w_in - 7; j += 8) { - uint8_t* outptr0 = dst + j * w_out + i; + uint8_t* outptr0 = dst + j * w_out + (ww - i); uint8_t* outptr1 = outptr0 + w_out; uint8_t* outptr2 = outptr1 + w_out; uint8_t* outptr3 = outptr2 + w_out; @@ -648,7 +678,7 @@ void rotate_hwc1_90(const uint8_t* src, const uint8_t* inptr6 = inptr5 + w_in; const uint8_t* inptr7 = inptr6 + w_in; for (; j < w_in; j++) { - uint8_t* outptr = dst + j * w_out + i; + uint8_t* outptr = dst + j * w_out + ww - i; *outptr++ = *inptr0++; *outptr++ = *inptr1++; *outptr++ = *inptr2++; @@ -659,10 +689,11 @@ void rotate_hwc1_90(const uint8_t* src, *outptr++ = *inptr7++; } } + ww = w_out - 1; for (; i < h_in; i++) { const uint8_t* inptr0 = src + i * w_in; for (int j = 0; j < w_in; j++) { - uint8_t* outptr0 = dst + j * w_out + i; + uint8_t* outptr0 = dst + j * w_out + ww - i; *outptr0 = *inptr0++; } } @@ -693,9 +724,9 @@ void rotate_hwc1_180(const uint8_t* src, const uint8_t* inptr3 = inptr2 + w_in; uint8_t* outptr0 = dst + (h_in - i) * w_out - stride_w; // last - uint8_t* outptr1 = outptr0 + w_out; - uint8_t* outptr2 = outptr1 + w_out; - uint8_t* outptr3 = outptr2 + w_out; + uint8_t* outptr1 = outptr0 - w_out; + uint8_t* outptr2 = outptr1 - w_out; + uint8_t* outptr3 = outptr2 - w_out; if (i + 3 >= h_in) { uint8_t* ptr = zerobuff + w_in - stride_w; diff --git a/lite/utils/cv/image_rotate.h b/lite/utils/cv/image_rotate.h index 8335fca280..8e04a3f524 100644 --- a/lite/utils/cv/image_rotate.h +++ b/lite/utils/cv/image_rotate.h @@ -16,10 +16,20 @@ #include #include +#include "lite/utils/cv/paddle_image_preprocess.h" namespace paddle { namespace lite { namespace utils { namespace cv { +class ImageRotate { + public: + void choose(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + int srcw, + int srch, + float degree); +}; void rotate_hwc1( const uint8_t* src, uint8_t* dst, int srcw, int srch, float degree); void rotate_hwc3( diff --git a/lite/utils/cv/paddle_image_preprocess.cc b/lite/utils/cv/paddle_image_preprocess.cc index f180475568..c46811a046 100644 --- a/lite/utils/cv/paddle_image_preprocess.cc +++ b/lite/utils/cv/paddle_image_preprocess.cc @@ -25,7 +25,6 @@ namespace paddle { namespace lite { namespace utils { namespace cv { - #define PI 3.14159265f #define 
Degrees2Radians(degrees) ((degrees) * (SK_ScalarPI / 180)) #define Radians2Degrees(radians) ((radians) * (180 / SK_ScalarPI)) @@ -38,7 +37,7 @@ ImagePreprocess::ImagePreprocess(ImageFormat srcFormat, this->dstFormat_ = dstFormat; this->transParam_ = param; } -void ImagePreprocess::imageCovert(const uint8_t* src, uint8_t* dst) { +void ImagePreprocess::imageConvert(const uint8_t* src, uint8_t* dst) { ImageConvert img_convert; img_convert.choose(src, dst, @@ -48,10 +47,10 @@ void ImagePreprocess::imageCovert(const uint8_t* src, uint8_t* dst) { this->transParam_.ih); } -void ImagePreprocess::imageCovert(const uint8_t* src, - uint8_t* dst, - ImageFormat srcFormat, - ImageFormat dstFormat) { +void ImagePreprocess::imageConvert(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + ImageFormat dstFormat) { ImageConvert img_convert; img_convert.choose(src, dst, @@ -68,7 +67,8 @@ void ImagePreprocess::imageResize(const uint8_t* src, int srch, int dstw, int dsth) { - resize(src, dst, srcFormat, srcw, srch, dstw, dsth); + ImageResize img_resize; + img_resize.choose(src, dst, srcFormat, srcw, srch, dstw, dsth); } void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst) { @@ -77,7 +77,8 @@ void ImagePreprocess::imageResize(const uint8_t* src, uint8_t* dst) { int dstw = this->transParam_.ow; int dsth = this->transParam_.oh; auto srcFormat = this->dstFormat_; - resize(src, dst, srcFormat, srcw, srch, dstw, dsth); + ImageResize img_resize; + img_resize.choose(src, dst, srcFormat, srcw, srch, dstw, dsth); } void ImagePreprocess::imageRotate(const uint8_t* src, @@ -86,19 +87,8 @@ void ImagePreprocess::imageRotate(const uint8_t* src, int srcw, int srch, float degree) { - if (degree != 90 && degree != 180 && degree != 270) { - printf("this degree: %f not support \n", degree); - } - if (srcFormat == GRAY) { - rotate_hwc1(src, dst, srcw, srch, degree); - } else if (srcFormat == BGR || srcFormat == RGB) { - rotate_hwc3(src, dst, srcw, srch, degree); - } else if (srcFormat == BGRA || srcFormat == RGBA) { - rotate_hwc4(src, dst, srcw, srch, degree); - } else { - printf("this srcFormat: %d does not support! \n", srcFormat); - return; - } + ImageRotate img_rotate; + img_rotate.choose(src, dst, srcFormat, srcw, srch, degree); } void ImagePreprocess::imageRotate(const uint8_t* src, uint8_t* dst) { @@ -106,10 +96,8 @@ void ImagePreprocess::imageRotate(const uint8_t* src, uint8_t* dst) { auto srch = this->transParam_.oh; auto srcFormat = this->dstFormat_; auto degree = this->transParam_.rotate_param; - if (degree != 90 && degree != 180 && degree != 270) { - printf("this degree: %f not support \n", degree); - } - ImagePreprocess::imageRotate(src, dst, srcFormat, srcw, srch, degree); + ImageRotate img_rotate; + img_rotate.choose(src, dst, srcFormat, srcw, srch, degree); } void ImagePreprocess::imageFlip(const uint8_t* src, @@ -118,16 +106,8 @@ void ImagePreprocess::imageFlip(const uint8_t* src, int srcw, int srch, FlipParam flip_param) { - if (srcFormat == GRAY) { - flip_hwc1(src, dst, srcw, srch, flip_param); - } else if (srcFormat == BGR || srcFormat == RGB) { - flip_hwc3(src, dst, srcw, srch, flip_param); - } else if (srcFormat == BGRA || srcFormat == RGBA) { - flip_hwc4(src, dst, srcw, srch, flip_param); - } else { - printf("this srcFormat: %d does not support! 
\n", srcFormat); - return; - } + ImageFlip img_flip; + img_flip.choose(src, dst, srcFormat, srcw, srch, flip_param); } void ImagePreprocess::imageFlip(const uint8_t* src, uint8_t* dst) { @@ -135,7 +115,8 @@ void ImagePreprocess::imageFlip(const uint8_t* src, uint8_t* dst) { auto srch = this->transParam_.oh; auto srcFormat = this->dstFormat_; auto flip_param = this->transParam_.flip_param; - ImagePreprocess::imageFlip(src, dst, srcFormat, srcw, srch, flip_param); + ImageFlip img_flip; + img_flip.choose(src, dst, srcFormat, srcw, srch, flip_param); } void ImagePreprocess::image2Tensor(const uint8_t* src, diff --git a/lite/utils/cv/paddle_image_preprocess.h b/lite/utils/cv/paddle_image_preprocess.h index 5a46a9e48e..a12c0d11f0 100644 --- a/lite/utils/cv/paddle_image_preprocess.h +++ b/lite/utils/cv/paddle_image_preprocess.h @@ -19,6 +19,7 @@ #include #include "lite/api/paddle_api.h" #include "lite/api/paddle_place.h" + namespace paddle { namespace lite { namespace utils { @@ -37,9 +38,9 @@ enum ImageFormat { }; // flip enum enum FlipParam { - X = 0, // flip along the X axis - Y, // flip along the Y axis - XY // flip along the XY axis + XY = -1, // flip along the XY axis + X = 0, // flip along the X axis + Y // flip along the Y axis }; // transform param typedef struct { @@ -69,11 +70,12 @@ class ImagePreprocess { * BGR(RGB)and BGRA(RGBA) transform, * BGR(RGB)and RGB(BGR) transform, * BGR(RGB)and RGBA(BGRA) transform, - * BGR(RGB)and GRAY transform, + * BGR(RGB) and GRAY transform, + * BGRA(RGBA) and GRAY transform, * param src: input image data * param dst: output image data */ - void imageCovert(const uint8_t* src, uint8_t* dst); + void imageConvert(const uint8_t* src, uint8_t* dst); /* * image color convert * support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA), @@ -81,6 +83,7 @@ class ImagePreprocess { * BGR(RGB)and RGB(BGR) transform, * BGR(RGB)and RGBA(BGRA) transform, * BGR(RGB)and GRAY transform, + * BGRA(RGBA) and GRAY transform, * param src: input image data * param dst: output image data * param srcFormat: input image image format support: GRAY, NV12(NV21), @@ -88,10 +91,10 @@ class ImagePreprocess { * param dstFormat: output image image format, support GRAY, BGR(RGB) and * BGRA(RGBA) */ - void imageCovert(const uint8_t* src, - uint8_t* dst, - ImageFormat srcFormat, - ImageFormat dstFormat); + void imageConvert(const uint8_t* src, + uint8_t* dst, + ImageFormat srcFormat, + ImageFormat dstFormat); /* * image resize, use bilinear method * support image format: 1-channel image (egs: GRAY, 2-channel image (egs: @@ -171,7 +174,8 @@ class ImagePreprocess { FlipParam flip_param); /* * change image data to tensor data - * support image format is BGR(RGB) and BGRA(RGBA), Data layout is NHWC and + * support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC + * and * NCHW * param src: input image data * param dstTensor: output tensor data @@ -186,7 +190,8 @@ class ImagePreprocess { float* scales); /* * change image data to tensor data - * support image format is BGR(RGB) and BGRA(RGBA), Data layout is NHWC and + * support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC + * and * NCHW * param src: input image data * param dstTensor: output tensor data diff --git a/lite/utils/env.h b/lite/utils/env.h old mode 100755 new mode 100644 diff --git a/mobile/src/common/log.h b/mobile/src/common/log.h index 69654c505d..3b42188b62 100644 --- a/mobile/src/common/log.h +++ b/mobile/src/common/log.h @@ -80,7 +80,6 @@ static const char *ANDROID_LOG_TAG = #endif enum 
LogLevel { - kNO_LOG, kLOG_ERROR, kLOG_WARNING, kLOG_INFO, @@ -89,15 +88,16 @@ enum LogLevel { kLOG_DEBUG1, kLOG_DEBUG2, kLOG_DEBUG3, - kLOG_DEBUG4 + kLOG_DEBUG4, + kNO_LOG, }; // log level static LogLevel log_level = kLOG_DEBUG4; -static std::vector logs{"NO ", "ERROR ", "WARNING", "INFO ", - "VERBOSE", "DEBUG ", "DEBUG1 ", "DEBUG2 ", - "DEBUG3 ", "DEBUG4 "}; +static std::vector logs{"ERROR ", "WARNING", "INFO ", "VERBOSE", + "DEBUG ", "DEBUG1 ", "DEBUG2 ", "DEBUG3 ", + "DEBUG4 ", "NO "}; struct ToLog; struct Print; @@ -217,7 +217,6 @@ struct ToLog { #define ANDROIDLOGV(...) enum LogLevel { - kNO_LOG, kLOG_ERROR, kLOG_WARNING, kLOG_INFO, @@ -226,7 +225,8 @@ enum LogLevel { kLOG_DEBUG1, kLOG_DEBUG2, kLOG_DEBUG3, - kLOG_DEBUG4 + kLOG_DEBUG4, + kNO_LOG }; struct ToLog; diff --git a/mobile/src/fpga/V2/image.cpp b/mobile/src/fpga/V2/image.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/fpga/V2/pe.cpp b/mobile/src/fpga/V2/pe.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/framework/cl/cl_deleter.h b/mobile/src/framework/cl/cl_deleter.h index 55af631174..731e5de663 100644 --- a/mobile/src/framework/cl/cl_deleter.h +++ b/mobile/src/framework/cl/cl_deleter.h @@ -15,45 +15,51 @@ limitations under the License. */ #pragma once #include "CL/cl.h" - +#include "common/log.h" struct CLKernelDeleter { template void operator()(T *clKernelObj) { - clReleaseKernel(clKernelObj); + const cl_int status = clReleaseKernel(clKernelObj); + LOG(paddle_mobile::kNO_LOG) << "clReleaseKernel status: " << status; } }; struct CLMemDeleter { template void operator()(T *clMemObj) { - clReleaseMemObject(clMemObj); + const cl_int status = clReleaseMemObject(clMemObj); + LOG(paddle_mobile::kNO_LOG) << "CLMemDeleter status: " << status; } }; struct CLEventDeleter { template void operator()(T *clEventObj) { - clReleaseEvent(clEventObj); + const cl_int status = clReleaseEvent(clEventObj); + LOG(paddle_mobile::kNO_LOG) << "CLEventDeleter status: " << status; } }; struct CLCommQueueDeleter { template void operator()(T *clQueueObj) { - clReleaseCommandQueue(clQueueObj); + const cl_int status = clReleaseCommandQueue(clQueueObj); + LOG(paddle_mobile::kNO_LOG) << "CLCommQueueDeleter status: " << status; } }; struct CLContextDeleter { template void operator()(T *clContextObj) { - clReleaseContext(clContextObj); + const cl_int status = clReleaseContext(clContextObj); + LOG(paddle_mobile::kNO_LOG) << "CLContextDeleter status: " << status; } }; struct CLProgramDeleter { template void operator()(T *clProgramObj) { - clReleaseProgram(clProgramObj); + const cl_int status = clReleaseProgram(clProgramObj); + LOG(paddle_mobile::kNO_LOG) << "CLProgramDeleter status: " << status; } }; diff --git a/mobile/src/framework/cl/cl_engine.cpp b/mobile/src/framework/cl/cl_engine.cpp index c39ae00b00..e8a8361eac 100644 --- a/mobile/src/framework/cl/cl_engine.cpp +++ b/mobile/src/framework/cl/cl_engine.cpp @@ -23,9 +23,11 @@ namespace paddle_mobile { namespace framework { bool CLEngine::Init() { + LOG(paddle_mobile::kNO_LOG) << "CLEngine::Init()"; if (initialized_) { return true; } + LOG(paddle_mobile::kNO_LOG) << "CLEngine::Init() ..."; cl_int status; bool is_setplatform_success = SetPlatform(); bool is_setcldeviceid_success = SetClDeviceId(); @@ -53,12 +55,14 @@ bool CLEngine::SetPlatform() { return false; } /**For clarity, choose the first available platform. 
*/ + LOG(paddle_mobile::kNO_LOG) << "numPlatforms: " << numPlatforms; if (numPlatforms > 0) { cl_platform_id *platforms = reinterpret_cast( malloc(numPlatforms * sizeof(cl_platform_id))); status = clGetPlatformIDs(numPlatforms, platforms, NULL); platform_ = platforms[0]; free(platforms); + LOG(paddle_mobile::kNO_LOG) << "platform: " << platform_; return status == CL_SUCCESS; } @@ -67,70 +71,21 @@ bool CLEngine::SetPlatform() { bool CLEngine::SetClDeviceId() { cl_uint numDevices = 0; - devices_ = NULL; + LOG(paddle_mobile::kNO_LOG) << "platform: " << platform_; cl_int status = clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); if (status != CL_SUCCESS) { return false; } + LOG(paddle_mobile::kNO_LOG) << "numDevices: " << numDevices; + if (numDevices > 0) { - devices_ = reinterpret_cast( - malloc(numDevices * sizeof(cl_device_id))); status = clGetDeviceIDs(platform_, CL_DEVICE_TYPE_GPU, numDevices, devices_, NULL); + LOG(paddle_mobile::kNO_LOG) << "devices_[0]" << devices_[0]; return status == CL_SUCCESS; } return false; } - -// std::unique_ptr<_cl_kernel, clKernel_deleter> CLEngine::GSetKernel( -// const std::string &kernel_name) { -// std::unique_ptr<_cl_kernel, clKernel_deleter> kernel( -// clCreateKernel(program_.get(), kernel_name.c_str(), NULL)); -// return std::move(kernel); -//} -// -// bool CLEngine::SetClCommandQueue() { -// cl_int status; -// command_queue_.reset( -// clCreateCommandQueue(context_.get(), devices_[0], 0, &status)); -// return true; -//} - -// bool CLEngine::SetClContext() { -// context_.reset(clCreateContext(NULL, 1, devices_, NULL, NULL, NULL)); -// return true; -//} - -// bool CLEngine::LoadKernelFromFile(const char *kernel_file) { -// size_t size; -// char *str; -// std::fstream f(kernel_file, (std::fstream::in | std::fstream::binary)); -// -// if (!f.is_open()) { -// return false; -// } -// -// size_t fileSize; -// f.seekg(0, std::fstream::end); -// size = fileSize = (size_t)f.tellg(); -// f.seekg(0, std::fstream::beg); -// str = new char[size + 1]; -// if (!str) { -// f.close(); -// return 0; -// } -// -// f.read(str, fileSize); -// f.close(); -// str[size] = '\0'; -// const char *source = str; -// size_t sourceSize[] = {strlen(source)}; -// program_.reset( -// clCreateProgramWithSource(context_.get(), 1, &source, sourceSize, -// NULL)); -// return true; -//} - } // namespace framework } // namespace paddle_mobile diff --git a/mobile/src/framework/cl/cl_engine.h b/mobile/src/framework/cl/cl_engine.h index 2e21dd9e39..2a6362ebc0 100644 --- a/mobile/src/framework/cl/cl_engine.h +++ b/mobile/src/framework/cl/cl_engine.h @@ -57,19 +57,27 @@ class CLLocalWorkSizeInfo { // max number of work-items in local_work_size in dim 2 size_t max_work_item_size2; }; - +inline void ctx_info(const char *errinfo, const void *private_info, size_t cb, + void *user_data) { + fprintf(stderr, "OpenCL Error (via pfn_notify): %s\n", errinfo); +} class CLEngine { public: static CLEngine *Instance(); bool Init(); bool isInitSuccess(); - std::unique_ptr<_cl_context, CLContextDeleter> CreateContext() { + + std::shared_ptr<_cl_context> CreateContext() { + DLOG << "CreateContext ---"; + DLOG << "platform: " << platform_; + DLOG << "devices_[0]: " << devices_[0]; + cl_int status; - cl_context c = clCreateContext(NULL, 1, devices_, NULL, NULL, &status); - std::unique_ptr<_cl_context, CLContextDeleter> context_ptr(c); + cl_context c = clCreateContext(NULL, 1, devices_, &ctx_info, NULL, &status); + std::shared_ptr<_cl_context> context(c, CLContextDeleter()); 
CL_CHECK_ERRORS(status); - return std::move(context_ptr); + return std::move(context); } std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> CreateClCommandQueue( @@ -84,14 +92,14 @@ class CLEngine { } cl_context getContext() { - if (context_ == nullptr) { + if (context_.get() == nullptr) { context_ = CreateContext(); } return context_.get(); } cl_command_queue getClCommandQueue() { - if (command_queue_ == nullptr) { + if (command_queue_.get() == nullptr) { command_queue_ = CreateClCommandQueue(getContext()); } return command_queue_.get(); @@ -124,9 +132,9 @@ class CLEngine { if (status != CL_SUCCESS || ret_size / sizeof(size_t) < 3) { return CLLocalWorkSizeInfo(0, 0, 0, 0); } - DLOG << max_work_item_sizes[0]; - DLOG << max_work_item_sizes[1]; - DLOG << max_work_item_sizes[2]; + DLOG << " max_work_item_sizes {" << max_work_item_sizes[0] << ", " + << max_work_item_sizes[1] << ", " << max_work_item_sizes[2] << "}"; + localWorkSizeInfo_ = CLLocalWorkSizeInfo(max_work_group_size, max_work_item_sizes[0], max_work_item_sizes[1], max_work_item_sizes[2]); @@ -182,8 +190,8 @@ class CLEngine { cl_program p = clCreateProgramWithSource(context, 1, &source, sourceSize, &status_); - DLOG << " cl kernel from source"; - DLOG << " source size: " << sourceSize[0]; + LOG(kLOG_DEBUG4) << " cl kernel from source"; + LOG(kLOG_DEBUG4) << " source size: " << sourceSize[0]; CL_CHECK_ERRORS(status_); std::unique_ptr<_cl_program, CLProgramDeleter> program_ptr(p); @@ -216,11 +224,7 @@ class CLEngine { DLOG << " program build error: " << log; } - if (status == CL_SUCCESS) { - return true; - } else { - return false; - } + return status == CL_SUCCESS; } cl_device_id DeviceID(int index = 0) { return devices_[index]; } @@ -239,28 +243,13 @@ class CLEngine { CLLocalWorkSizeInfo localWorkSizeInfo_; - cl_platform_id platform_; - - cl_device_id *devices_; - cl_int status_; - std::string cl_path_; - std::unique_ptr<_cl_program, CLProgramDeleter> program_; - - std::unique_ptr<_cl_context, CLContextDeleter> context_ = nullptr; - - std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_ = - nullptr; - - // bool SetClContext(); - - // bool SetClCommandQueue(); - - // bool LoadKernelFromFile(const char *kernel_file); - - // bool BuildProgram(); bool is_init_success_ = false; + std::unique_ptr<_cl_command_queue, CLCommQueueDeleter> command_queue_; + std::shared_ptr<_cl_context> context_; + cl_device_id devices_[10]; + cl_platform_id platform_; }; } // namespace framework diff --git a/mobile/src/framework/cl/cl_helper.h b/mobile/src/framework/cl/cl_helper.h index 893456211d..db9aa37ae2 100644 --- a/mobile/src/framework/cl/cl_helper.h +++ b/mobile/src/framework/cl/cl_helper.h @@ -36,9 +36,9 @@ class CLHelper { void AddKernel(const std::string &kernel_name, const std::string &file_name, const std::string &options = "") { - DLOG << " begin add kernel "; + LOG(kLOG_DEBUG1) << " begin add kernel "; auto kernel = scope_->GetKernel(kernel_name, file_name, options); - DLOG << " add kernel ing "; + LOG(kLOG_DEBUG1) << " begin add kernel "; kernels.emplace_back(std::move(kernel)); } diff --git a/mobile/src/framework/cl/cl_image.h b/mobile/src/framework/cl/cl_image.h index d3d48cda8b..57656c3c6d 100644 --- a/mobile/src/framework/cl/cl_image.h +++ b/mobile/src/framework/cl/cl_image.h @@ -87,14 +87,14 @@ class CLImage { PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr, " need call SetTensorData first"); - DLOG << " begin init cl image "; + LOG(kNO_LOG) << " begin init cl image "; image_dims_ = 
converter->InitImageDimInfoWith(tensor_dims_); half_t *image_data = new half_t[product(image_dims_) * 4]; - DLOG << " convert to image"; + LOG(kNO_LOG) << " convert to image"; converter->NCHWToImage(tensor_data_, image_data, tensor_dims_); - DLOG << " end convert to image"; + LOG(kNO_LOG) << " end convert to image"; InitCLImage(context, image_dims_[0], image_dims_[1], image_data); @@ -105,7 +105,7 @@ class CLImage { tensor_data_ = nullptr; image_converter_ = converter; initialized_ = true; - DLOG << " end init cl image"; + LOG(kNO_LOG) << " end init cl image"; } void InitNImage(cl_context context, cl_command_queue command_queue) { @@ -137,9 +137,9 @@ class CLImage { // CLImageConverterFolder(); CLImageConverterNormal *normal_converter = new CLImageConverterNormal(); PADDLE_MOBILE_ENFORCE(!shared_mem_, "do not init mem after shared .") - DLOG << " to get image dims "; + // LOG(kNO_LOG) << " to get image dims "; image_dims_ = normal_converter->InitImageDimInfoWith(dim); - DLOG << " end get image dims " << image_dims_; + // LOG(kNO_LOG) << " end get image dims " << image_dims_; InitCLImage(context, image_dims_[0], image_dims_[1], nullptr); @@ -148,7 +148,7 @@ class CLImage { image_converter_ = normal_converter; cl_event_ = CLEngine::Instance()->CreateEvent(context); initialized_ = true; - DLOG << " end init cl image"; + // LOG(kNO_LOG) << " end init cl image"; } /** * create fake size cl_mem for mem share @@ -169,9 +169,9 @@ class CLImage { InitCLImage(context, real_image_dims_[0], real_image_dims_[1], nullptr); // cheat cl_image they got what they wanted image_dims_ = normal_converter->InitImageDimInfoWith(need_dims); - DLOG << "InitFakeSizeImage ... "; - DLOG << "real_image_dims: " << real_image_dims_; - DLOG << "image_dims_: " << image_dims_; + LOG(kNO_LOG) << "InitFakeSizeImage ... "; + LOG(kNO_LOG) << "real_image_dims: " << real_image_dims_; + LOG(kNO_LOG) << "image_dims_: " << image_dims_; PADDLE_MOBILE_ENFORCE(real_image_dims_[0] >= image_dims_[0] && real_image_dims_[1] >= image_dims_[1], "real image is not enough"); @@ -182,7 +182,7 @@ class CLImage { initialized_ = true; shared_mem_ = true; - DLOG << " end init FakeSizeImage"; + LOG(kNO_LOG) << " end init FakeSizeImage"; } /** * init cl mem with a exist cl mem @@ -197,21 +197,21 @@ class CLImage { real_image_dims_ = src.real_image_dims_; image_dims_ = normal_converter->InitImageDimInfoWith(need_dims); - DLOG << "InitWithExistMem ... "; - DLOG << "real_image_dims: " << real_image_dims_; - DLOG << "image_dims_: " << image_dims_; + LOG(kNO_LOG) << "InitWithExistMem ... 
"; + LOG(kNO_LOG) << "real_image_dims: " << real_image_dims_; + LOG(kNO_LOG) << "image_dims_: " << image_dims_; if (real_image_dims_[0] < image_dims_[0] || real_image_dims_[1] < image_dims_[1]) { - DLOG << "real image is not enough!"; - DLOG << "real_image_dims: " << real_image_dims_; - DLOG << "image_dims_: " << image_dims_; + LOG(kNO_LOG) << "real image is not enough!"; + LOG(kNO_LOG) << "real_image_dims: " << real_image_dims_; + LOG(kNO_LOG) << "image_dims_: " << image_dims_; } PADDLE_MOBILE_ENFORCE(real_image_dims_[0] >= image_dims_[0] && real_image_dims_[1] >= image_dims_[1], "real image is not enough!"); if (cl_image_ != src.cl_image_) { - cl_image_.reset(src.cl_image_.get(), CLMemDeleter()); + cl_image_ = src.cl_image_; } tensor_dims_ = need_dims; @@ -221,7 +221,7 @@ class CLImage { initialized_ = true; shared_mem_ = true; - DLOG << " end init WithExistMem"; + LOG(kNO_LOG) << " end init WithExistMem"; } void InitConv2dTransposeFilterCLImage(cl_context context, @@ -233,18 +233,6 @@ class CLImage { InitCLImage(context, command_queue, converter); } - /*! The internal of two tensors share the same memory block. */ - inline CLImage &ShareHolderWith(const CLImage &src) { - PADDLE_MOBILE_ENFORCE( - src.cl_image_ != nullptr, - "Tensor holds no memory. Call Tensor::mutable_data first.") - - if (cl_image_ != src.cl_image_) { - cl_image_.reset(src.cl_image_.get(), CLMemDeleter()); - } - return *this; - } - cl_mem GetCLImage() const { return cl_image_.get(); } const DDim &ImageDims() const { return image_dims_; } diff --git a/mobile/src/framework/cl/cl_scope.h b/mobile/src/framework/cl/cl_scope.h index 643ce32b57..49e705e5a0 100644 --- a/mobile/src/framework/cl/cl_scope.h +++ b/mobile/src/framework/cl/cl_scope.h @@ -35,30 +35,27 @@ namespace framework { class CLScope { public: - CLScope() { - CLEngine *engine = CLEngine::Instance(); - context_ = engine->getContext(); - command_queue_ = engine->getClCommandQueue(); - localWorkSizeInfo_ = engine->getLocalWorkSizeInfo(); - } + CLScope() {} - cl_command_queue CommandQueue() { return command_queue_; } + cl_command_queue CommandQueue() { + return CLEngine::Instance()->getClCommandQueue(); + } std::unique_ptr<_cl_kernel, CLKernelDeleter> GetKernel( const std::string &kernel_name, const std::string &file_name, const std::string &options) { - DLOG << " to get program " << file_name; + LOG(kLOG_DEBUG2) << " to get program " << file_name; auto program = Program(file_name, kernel_name, options); - DLOG << " end get program ~ "; - DLOG << " to create kernel: " << kernel_name; + LOG(kLOG_DEBUG2) << " end get program ~ "; + LOG(kLOG_DEBUG2) << " to create kernel: " << kernel_name; std::unique_ptr<_cl_kernel, CLKernelDeleter> kernel( clCreateKernel(program, kernel_name.c_str(), &status_)); CL_CHECK_ERRORS(status_); - DLOG << " end create kernel ~ "; + LOG(kLOG_DEBUG2) << " end create kernel ~ "; return std::move(kernel); } - cl_context Context() { return context_; } + cl_context Context() { return CLEngine::Instance()->getContext(); } cl_program Program(const std::string &file_name, const std::string &kernel_name, @@ -79,11 +76,13 @@ class CLScope { std::string header(header_it->second.begin(), header_it->second.end()); source = header + "\n" + source; auto program = CLEngine::Instance()->CreateProgramWithSource( - context_, source.c_str()); + CLEngine::Instance()->getContext(), source.c_str()); - DLOG << " --- begin build program -> " << program_key << " --- "; + LOG(kLOG_DEBUG3) << " --- begin build program -> " << program_key + << " --- "; 
CLEngine::Instance()->BuildProgram(program.get(), options); - DLOG << " --- end build program -> " << program_key << " --- "; + LOG(kLOG_DEBUG3) << " --- end build program -> " << program_key + << " --- "; programs_[program_key] = std::move(program); return programs_[program_key].get(); @@ -97,19 +96,23 @@ class CLScope { return it->second.get(); } auto program = CLEngine::Instance()->CreateProgramWith( - context_, + CLEngine::Instance()->getContext(), CLEngine::Instance()->GetCLPath() + "/cl_kernel/" + file_name); - DLOG << " --- begin build program -> " << program_key << " --- "; + LOG(kLOG_DEBUG3) << " --- begin build program ele-> " << program_key + << " --- "; CLEngine::Instance()->BuildProgram(program.get(), options); - DLOG << " --- end build program -> " << program_key << " --- "; + LOG(kLOG_DEBUG3) << " --- end build program ele-> " << program_key + << " --- "; programs_[program_key] = std::move(program); return programs_[program_key].get(); } } - CLLocalWorkSizeInfo LocalWorkSizeInfo() { return localWorkSizeInfo_; } + CLLocalWorkSizeInfo LocalWorkSizeInfo() { + return CLEngine::Instance()->getLocalWorkSizeInfo(); + } size_t KernelWorkSize(cl_kernel kernel) { size_t kernel_work_size = CLEngine::Instance()->GetKernelWorkSize(kernel); return kernel_work_size; @@ -117,12 +120,9 @@ class CLScope { private: cl_int status_; - cl_context context_; - cl_command_queue command_queue_; std::unordered_map> programs_; - CLLocalWorkSizeInfo localWorkSizeInfo_; }; } // namespace framework diff --git a/mobile/src/framework/context.h b/mobile/src/framework/context.h index 944d54cc49..18e40311bc 100644 --- a/mobile/src/framework/context.h +++ b/mobile/src/framework/context.h @@ -44,15 +44,13 @@ namespace framework { struct CPUContext { private: CPUContext(); - virtual ~CPUContext() {} public: + ~CPUContext() {} + static CPUContext* Context() { - static CPUContext* ctx = nullptr; - if (ctx == nullptr) { - ctx = new CPUContext(); - } - return ctx; + static CPUContext ctx; + return &ctx; } void set_thread_num(int thread_num, diff --git a/mobile/src/framework/executor.cpp b/mobile/src/framework/executor.cpp index d03cefe59a..cda5c5522c 100644 --- a/mobile/src/framework/executor.cpp +++ b/mobile/src/framework/executor.cpp @@ -80,7 +80,7 @@ Executor::Executor(const Program &program, std::vector> ops = block_desc->Ops(); for (int j = 0; j < ops.size(); ++j) { std::shared_ptr op_desc = ops[j]; - DLOG << "create op: " << op_desc->Type(); + LOG(kLOG_INFO) << "create op[" << j << "]: " << op_desc->Type(); auto op_handler = OpRegistry::CreateOp( op_desc->Type(), op_desc->GetInputs(), op_desc->GetOutputs(), @@ -111,7 +111,8 @@ Executor::Executor(const Program &program, clock_gettime(CLOCK_MONOTONIC, &ts); profile[op_index].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; #endif - DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type(); + LOG(kLOG_INFO) << "Initialize op[" << count++ + << "]: " << op_handler->Type(); if (op_handler->Type() == "feed" || op_handler->Type() == "fetch") { op_handler->setPrePostType(config_.pre_post_type); } @@ -1015,7 +1016,7 @@ void Executor::InitMemory() { const TensorDesc &desc = var_desc->Tensor_desc(); // DDim ddim = make_ddim(desc.Dims()); DDim ddim = cl_image->dims(); - DLOG << var_desc->Name(); + LOG(kLOG_DEBUG1) << "init image of " << var_desc->Name(); cl_image->InitEmptyImage(context, command_queue, ddim); } } diff --git a/mobile/src/framework/loader.cpp b/mobile/src/framework/loader.cpp index 34cf6253cb..31274743f8 100644 --- a/mobile/src/framework/loader.cpp 
+++ b/mobile/src/framework/loader.cpp @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "framework/loader.h" +#include #include "framework/lod_tensor.h" #include "framework/program/program-optimize/program_optimize.h" @@ -173,7 +174,7 @@ static size_t ReadBuffer(const char *file_name, uint8_t **out) { rewind(fp); DLOG << "model size: " << size; - + PADDLE_MOBILE_ENFORCE(size > 0, "model size should > 0") *out = reinterpret_cast(malloc(size)); size_t cur_len = 0; diff --git a/mobile/src/framework/operator.cpp b/mobile/src/framework/operator.cpp index 402512c723..a091a49b35 100644 --- a/mobile/src/framework/operator.cpp +++ b/mobile/src/framework/operator.cpp @@ -62,31 +62,39 @@ void OperatorBase::Run() { DLOG << "-------------" << type_ << "----------------------------"; vector input_keys = GetInputKeys(); for (const auto key : input_keys) { - auto var_vec_in = inputs_.at(key); - for (int i = 0; i < var_vec_in.size(); ++i) { - auto var = this->scope_->FindVar(var_vec_in[i]); - if (var->IsInitialized() && - var->template IsType()) { - const Tensor *tensor = var->template Get(); - if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor; + if (inputs_.count(key) > 0) { + auto var_vec_in = inputs_.at(key); + for (int i = 0; i < var_vec_in.size(); ++i) { + auto var = this->scope_->FindVar(var_vec_in[i]); + if (var->IsInitialized() && + var->template IsType()) { + const Tensor *tensor = var->template Get(); + if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor; #ifdef PADDLE_MOBILE_FPGA - DLOG << var_vec_in[i]; + DLOG << var_vec_in[i]; #endif + } } + } else { + DLOG << "did not find key (" << key << ") in inputs_"; } } for (const auto key : GetOutKeys()) { - auto var_vec_out = outputs_.at(key); - for (int i = 0; i < var_vec_out.size(); ++i) { - auto var = scope_->FindVar(var_vec_out[i]); - if (var->IsInitialized() && - var->template IsType()) { - const Tensor *tensor = var->template Get(); - if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor; + if (outputs_.count(key) > 0) { + auto var_vec_out = outputs_.at(key); + for (int i = 0; i < var_vec_out.size(); ++i) { + auto var = scope_->FindVar(var_vec_out[i]); + if (var->IsInitialized() && + var->template IsType()) { + const Tensor *tensor = var->template Get(); + if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor; #ifdef PADDLE_MOBILE_FPGA - DLOG << var_vec_out[i]; + DLOG << var_vec_out[i]; #endif + } } + } else { + DLOG << "did not find key (" << key << ") in outputs_"; } } #endif @@ -100,27 +108,37 @@ void OperatorBase::Run() { DLOG << "-------------" << type_ << "----------------------------"; vector input_keys = GetInputKeys(); for (const auto key : input_keys) { - auto var_vec_in = inputs_.at(key); - for (int i = 0; i < var_vec_in.size(); ++i) { - auto var = scope_->FindVar(var_vec_in[i]); - if (var->IsInitialized() && var->template IsType()) { - const CLImage *cl_image = var->template Get(); - if (cl_image) { - DLOG << type_ << " input- " << key << "=" << *cl_image; + if (inputs_.count(key) > 0) { + auto var_vec_in = inputs_.at(key); + for (int i = 0; i < var_vec_in.size(); ++i) { + auto var = scope_->FindVar(var_vec_in[i]); + if (var->IsInitialized() && + var->template IsType()) { + const CLImage *cl_image = var->template Get(); + if (cl_image) { + DLOG << type_ << " input- " << key << "=" << *cl_image; + } } } + } else { + DLOG << "did not find key (" << key << ") in inputs_"; } } for (const auto key : 
GetOutKeys()) { - auto var_vec_out = outputs_.at(key); - for (int i = 0; i < var_vec_out.size(); ++i) { - auto var = scope_->FindVar(var_vec_out[i]); - if (var->IsInitialized() && var->template IsType()) { - const CLImage *cl_image = var->template Get(); - if (cl_image) { - DLOG << type_ << " output- " << key << "=" << *cl_image; + if (outputs_.count(key) > 0) { + auto var_vec_out = outputs_.at(key); + for (int i = 0; i < var_vec_out.size(); ++i) { + auto var = scope_->FindVar(var_vec_out[i]); + if (var->IsInitialized() && + var->template IsType()) { + const CLImage *cl_image = var->template Get(); + if (cl_image) { + DLOG << type_ << " output- " << key << "=" << *cl_image; + } } } + } else { + DLOG << "did not find key (" << key << ") in outputs_"; } } #endif diff --git a/mobile/src/io/opencl_interface.cpp b/mobile/src/io/opencl_interface.cpp index 1df5b48339..636cd1b760 100644 --- a/mobile/src/io/opencl_interface.cpp +++ b/mobile/src/io/opencl_interface.cpp @@ -28,8 +28,26 @@ cl_command_queue getClCommandQueue() { } bool isInitSuccess() { + prepareOpenclRuntime(); return framework::CLEngine::Instance()->isInitSuccess(); } +bool prepareOpenclRuntime() { +#ifdef PREPARE_OPENCL_RUNTIME + DLOG << "cl runtime prepared. "; + cl_uint numPlatforms; // the NO. of platforms + cl_int status = clGetPlatformIDs(0, NULL, &numPlatforms); + if (status == CL_SUCCESS) { + if (numPlatforms > 0) { + cl_platform_id *platforms = reinterpret_cast( + malloc(numPlatforms * sizeof(cl_platform_id))); + status = clGetPlatformIDs(numPlatforms, platforms, NULL); + free(platforms); + } + } +#endif + return true; +} + } // namespace paddle_mobile #endif diff --git a/mobile/src/io/opencl_interface.h b/mobile/src/io/opencl_interface.h index f1039f1373..6a3608790a 100644 --- a/mobile/src/io/opencl_interface.h +++ b/mobile/src/io/opencl_interface.h @@ -21,6 +21,7 @@ namespace paddle_mobile { cl_context getContext(); cl_command_queue getClCommandQueue(); bool isInitSuccess(); +bool prepareOpenclRuntime(); } // namespace paddle_mobile diff --git a/mobile/src/io/paddle_mobile.h b/mobile/src/io/paddle_mobile.h index 8b8f0683ab..8c40b0696a 100644 --- a/mobile/src/io/paddle_mobile.h +++ b/mobile/src/io/paddle_mobile.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "io/paddle_inference_api.h" #ifdef PADDLE_MOBILE_CL #include "framework/cl/cl_engine.h" +#include "io/opencl_interface.h" #endif namespace paddle_mobile { @@ -34,16 +35,24 @@ template class PaddleMobile { public: explicit PaddleMobile(PaddleMobileConfigInternal config) : config_(config) { -#ifndef PADDLE_MOBILE_CL bool is_gpu = std::is_same, Device>::value; +#ifndef PADDLE_MOBILE_CL PADDLE_MOBILE_ENFORCE(!is_gpu, "Please recompile with GPU_CL is on"); +#else + if (is_gpu) { + prepareOpenclRuntime(); + } #endif } PaddleMobile() { -#ifndef PADDLE_MOBILE_CL bool is_gpu = std::is_same, Device>::value; +#ifndef PADDLE_MOBILE_CL PADDLE_MOBILE_ENFORCE(!is_gpu, "Please recompile with GPU_CL is on"); +#else + if (is_gpu) { // recheck when run cpu in with opencl. 
+ prepareOpenclRuntime(); + } #endif } virtual ~PaddleMobile() { Clear(); } diff --git a/mobile/src/operators/expand_op.cpp b/mobile/src/operators/expand_op.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/expand_op.h b/mobile/src/operators/expand_op.h old mode 100755 new mode 100644 diff --git a/mobile/src/operators/grid_sampler_op.cpp b/mobile/src/operators/grid_sampler_op.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/grid_sampler_op.h b/mobile/src/operators/grid_sampler_op.h old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/mobile/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl index 4895c07d20..b7f4d16c3b 100644 --- a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl +++ b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_mul_kernel.cl @@ -110,4 +110,22 @@ __kernel void channel_mul_d2(__global image2d_t input, __global image2d_t bias, half4 in = read_imageh(input, sampler, coords); half4 output = mad(in, biase, 0); write_imageh(outputImage, coords, output); +} + +__kernel void channel_mul_d4(__global image2d_t input, __global image2d_t bias, + __write_only image2d_t outputImage, int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x / w; + coords_bias.y = 0; + half4 in = read_imageh(input, sampler, coords); + half4 biase = read_imageh(bias, sampler, coords_bias); + half4 output = in * biase; + write_imageh(outputImage, coords, output); } \ No newline at end of file diff --git a/mobile/src/operators/kernel/cl/cl_kernel/elementwise_sub_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/elementwise_sub_kernel.cl old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/cl/cl_kernel/expend.cl b/mobile/src/operators/kernel/cl/cl_kernel/expend.cl old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/cl/cl_kernel/grid_sampler_kernel.cl b/mobile/src/operators/kernel/cl/cl_kernel/grid_sampler_kernel.cl old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp b/mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp index 8d66b50a99..4261681f3e 100644 --- a/mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp +++ b/mobile/src/operators/kernel/cl/conv_transpose_kernel.cpp @@ -43,7 +43,10 @@ bool ConvTransposeKernel::Init( this->cl_helper_.AddKernel("conv_transpose3x3s2", "conv_transpose_kernel.cl"); } else { - PADDLE_MOBILE_THROW_EXCEPTION(" not support "); + param->ExecMode() = ConvTransposeParam::EXEC_CONVTRANS_FLOAT; + param->Filter()->InitConv2dTransposeFilterCLImage( + cl_helper_.CLContext(), cl_helper_.CLCommandQueue()); + this->cl_helper_.AddKernel("conv_transpose", "conv_transpose_kernel.cl"); } return true; } @@ -58,6 +61,9 @@ void ConvTransposeKernel::Compute( case ConvTransposeParam::EXEC_CONVTRANS3x3s2_FLOAT: ConvTranspose3x3s2AddBnRelu(&this->cl_helper_, param); break; + case ConvTransposeParam::EXEC_CONVTRANS_FLOAT: + ConvTransposeAddBnRelu(&this->cl_helper_, param); + break; default: PADDLE_MOBILE_THROW_EXCEPTION( "Invalid convolution transpose execute mode %d", param.ExecMode()); diff --git 
a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp index fd5b9e6bc3..37034a0189 100644 --- a/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp +++ b/mobile/src/operators/kernel/cl/elementwise_mul_kernel.cpp @@ -30,16 +30,23 @@ bool ElementwiseMulKernel::Init( if (bias->dims() == param->InputX()->dims()) { DLOG << "init element wise mul"; this->cl_helper_.AddKernel("elementwise_mul", "elementwise_mul_kernel.cl"); - } else if (bias->dims().size() == 1) { - DLOG << "init channel_mul"; - this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl"); - } else if (bias->dims().size() == 2) { - // etc. input 1 72 28 28 - // filter 1 72 - DLOG << "init channel_mul_d2"; - this->cl_helper_.AddKernel("channel_mul_d2", "elementwise_mul_kernel.cl"); } else { - PADDLE_MOBILE_ENFORCE(false, "element mul not supported yet"); + const int bias_dim_size = bias->dims().size(); + if (bias_dim_size == 1) { + DLOG << "init channel_mul"; + this->cl_helper_.AddKernel("channel_mul", "elementwise_mul_kernel.cl"); + } else if (bias_dim_size == 2) { + // etc. input 1 72 28 28 + // filter 1 72 + DLOG << "init channel_mul_d2"; + this->cl_helper_.AddKernel("channel_mul_d2", "elementwise_mul_kernel.cl"); + } else if (bias_dim_size == 4) { + DLOG << "init channel_mul_d4"; + this->cl_helper_.AddKernel("channel_mul_d4", "elementwise_mul_kernel.cl"); + } else { + PADDLE_MOBILE_ENFORCE(false, + "element mul not supported this situation yet"); + } } return true; } @@ -71,68 +78,103 @@ void ElementwiseMulKernel::Compute( clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); CL_CHECK_ERRORS(status); - } else if (bias->dims().size() == 1) { - DLOG << "channel mul"; - cl_mem input_image = input->GetCLImage(); - cl_mem bias_image = bias->GetCLImage(); - cl_mem output_image = output->GetCLImage(); - int tensor_w = input->dims()[input->dims().size() - 1]; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), - reinterpret_cast(&input_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), - reinterpret_cast(&bias_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_mem), - reinterpret_cast(&output_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), - reinterpret_cast(&tensor_w)); - CL_CHECK_ERRORS(status); - auto width = input->ImageWidth(); - auto height = input->ImageHeight(); - size_t global_work_size[2] = {width, height}; - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, global_work_size, NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); - } else if (bias->dims().size() == 2) { - DLOG << "channel mul d2"; + } else { + const int bias_dim_size = bias->dims().size(); + if (bias_dim_size == 1) { + DLOG << "channel mul"; + cl_mem input_image = input->GetCLImage(); + cl_mem bias_image = bias->GetCLImage(); + cl_mem output_image = output->GetCLImage(); + int tensor_w = input->dims()[input->dims().size() - 1]; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), + reinterpret_cast(&input_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), + reinterpret_cast(&bias_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), + reinterpret_cast(&output_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), + reinterpret_cast(&tensor_w)); + CL_CHECK_ERRORS(status); 
+ auto width = input->ImageWidth(); + auto height = input->ImageHeight(); + size_t global_work_size[2] = {width, height}; + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } else if (bias_dim_size == 2) { + DLOG << "channel mul d2"; - // etc. input 1 72 28 28 - // filter 1 72 --> 1 1 1 72 - DLOG << "input->ImageDims(): " << input->ImageDims(); - DLOG << "bias->ImageDims(): " << bias->ImageDims(); - DLOG << "out->ImageDims(): " << output->ImageDims(); + // etc. input 1 72 28 28 + // filter 1 72 --> 1 1 1 72 + DLOG << "input->ImageDims(): " << input->ImageDims(); + DLOG << "bias->ImageDims(): " << bias->ImageDims(); + DLOG << "out->ImageDims(): " << output->ImageDims(); - DLOG << "channel mul d2"; - cl_mem input_image = input->GetCLImage(); - cl_mem bias_image = bias->GetCLImage(); - cl_mem output_image = output->GetCLImage(); - int tensor_w = input->dims()[input->dims().size() - 1]; - status = clSetKernelArg(kernel, 0, sizeof(cl_mem), - reinterpret_cast(&input_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 1, sizeof(cl_mem), - reinterpret_cast(&bias_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 2, sizeof(cl_mem), - reinterpret_cast(&output_image)); - CL_CHECK_ERRORS(status); - status = clSetKernelArg(kernel, 3, sizeof(cl_int), - reinterpret_cast(&tensor_w)); - CL_CHECK_ERRORS(status); - auto width = input->ImageWidth(); - auto height = input->ImageHeight(); - size_t global_work_size[2] = {width, height}; - status = - clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, - NULL, global_work_size, NULL, 0, NULL, NULL); - CL_CHECK_ERRORS(status); + DLOG << "channel mul d2"; + cl_mem input_image = input->GetCLImage(); + cl_mem bias_image = bias->GetCLImage(); + cl_mem output_image = output->GetCLImage(); + int tensor_w = input->dims()[input->dims().size() - 1]; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), + reinterpret_cast(&input_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), + reinterpret_cast(&bias_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), + reinterpret_cast(&output_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), + reinterpret_cast(&tensor_w)); + CL_CHECK_ERRORS(status); + auto width = input->ImageWidth(); + auto height = input->ImageHeight(); + size_t global_work_size[2] = {width, height}; + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); - // bias->PrintTensor(*bias); - } else { - PADDLE_MOBILE_ENFORCE(false, "element mul not support this situation yet") + // bias->PrintTensor(*bias); + } else if (bias_dim_size == 4) { + DLOG << "channel_mul_d4"; + // etc. 
input 1 72 28 28 + // filter 1 72 --> 1 1 1 72 + DLOG << "input->ImageDims(): " << input->ImageDims(); + DLOG << "bias->ImageDims(): " << bias->ImageDims(); + DLOG << "out->ImageDims(): " << output->ImageDims(); + + DLOG << "channel mul d2"; + cl_mem input_image = input->GetCLImage(); + cl_mem bias_image = bias->GetCLImage(); + cl_mem output_image = output->GetCLImage(); + int tensor_w = input->dims()[input->dims().size() - 1]; + status = clSetKernelArg(kernel, 0, sizeof(cl_mem), + reinterpret_cast(&input_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 1, sizeof(cl_mem), + reinterpret_cast(&bias_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 2, sizeof(cl_mem), + reinterpret_cast(&output_image)); + CL_CHECK_ERRORS(status); + status = clSetKernelArg(kernel, 3, sizeof(cl_int), + reinterpret_cast(&tensor_w)); + CL_CHECK_ERRORS(status); + auto width = input->ImageWidth(); + auto height = input->ImageHeight(); + size_t global_work_size[2] = {width, height}; + status = + clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 2, + NULL, global_work_size, NULL, 0, NULL, NULL); + CL_CHECK_ERRORS(status); + } else { + PADDLE_MOBILE_ENFORCE(false, "element mul not support this situation yet") + } } } diff --git a/mobile/src/operators/kernel/cl/elementwise_sub_kernel.cpp b/mobile/src/operators/kernel/cl/elementwise_sub_kernel.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/cl/expand_kernel.cpp b/mobile/src/operators/kernel/cl/expand_kernel.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/cl/grid_sampler_kernel.cpp b/mobile/src/operators/kernel/cl/grid_sampler_kernel.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/expand_kernel.h b/mobile/src/operators/kernel/expand_kernel.h old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/elementwise_add_kernel.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/elementwise_add_relu_kernel.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp b/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp old mode 100755 new mode 100644 diff --git a/mobile/src/operators/kernel/grid_sampler_kernel.h b/mobile/src/operators/kernel/grid_sampler_kernel.h old mode 100755 new mode 100644 diff --git a/mobile/src/operators/op_param.h b/mobile/src/operators/op_param.h index e58159fbb7..f588b9fc79 100644 --- a/mobile/src/operators/op_param.h +++ b/mobile/src/operators/op_param.h @@ -344,10 +344,14 @@ class OpParam { template static const T GetAttr(const string &key, const AttributeMap &map) { + PADDLE_MOBILE_ENFORCE(HasAttr(key, map), "%s is not contained in attr map", + key.c_str()) return ((Attribute)map.at(key)).Get(); } static const std::string GetStringAttr(const string &key, const AttributeMap &map) { + PADDLE_MOBILE_ENFORCE(HasAttr(key, map), "%s is not contained in attr map", + key.c_str()) return ((Attribute)map.at(key)).GetString(); } @@ -355,6 +359,10 @@ class OpParam { return map.count(key) > 0; } + static const bool HasVar(const string &key, const VariableNameMap &var_map) { + return var_map.count(key) > 0; + } + template static T 
*GetVarValue(const string &key, const VariableNameMap &var_map, const Scope &scope) { @@ -2624,6 +2632,7 @@ class ConvTransposeParam : public OpParam { EXEC_DECONV4X4_FLOAT, EXEC_DEPTHWISETRANS_FLOAT, EXEC_CONVTRANS3x3s2_FLOAT, + EXEC_CONVTRANS_FLOAT, }; ExecMode &ExecMode() const { return exec_mode_; } @@ -3100,16 +3109,37 @@ class NearestInterpolationParam : public OpParam { const AttributeMap &attrs, Scope *scope) : OpParam(inputs, outputs, attrs, scope) { input_x_ = InputXFrom(inputs, *scope); - input_outsize_ = InputOutSizeFrom(inputs, *scope); + const bool has_out_size = HasVar("OutSize", inputs); + + if (has_out_size) { + input_outsize_ = InputOutSizeFrom(inputs, *scope); + } + out_ = OutFrom(outputs, *scope); - out_h_ = GetAttr("out_h", attrs); - out_w_ = GetAttr("out_w", attrs); + + if (HasAttr("out_h", attrs)) { + out_h_ = GetAttr("out_h", attrs); + } else if (HasAttr("out_h ", attrs)) { + // some models hurts .... attr with space .. + out_h_ = GetAttr("out_h ", attrs); + } + + if (HasAttr("out_w", attrs)) { + out_w_ = GetAttr("out_w", attrs); + } else if (HasAttr("out_w ", attrs)) { + // some models hurts .... attr with space .. + out_w_ = GetAttr("out_w ", attrs); + } + + LOG(kLOG_DEBUG1) << "out_h_: " << out_h_; + LOG(kLOG_DEBUG1) << "out_w_: " << out_w_; + if (HasAttr("scale", attrs)) { has_scale_ = true; scale_ = GetAttr("scale", attrs); } - DLOG << "has_scale_: " << has_scale_; - DLOG << "scale_: " << scale_; + LOG(kLOG_DEBUG1) << "has_scale_: " << has_scale_; + LOG(kLOG_DEBUG1) << "scale_: " << scale_; } const GType *InputX() const { return input_x_; } const GType *InputOutPutSize() const { return input_outsize_; } diff --git a/mobile/src/pass/memory_optimize_cl.cpp b/mobile/src/pass/memory_optimize_cl.cpp index 355123349d..53bb675f17 100644 --- a/mobile/src/pass/memory_optimize_cl.cpp +++ b/mobile/src/pass/memory_optimize_cl.cpp @@ -14,6 +14,7 @@ limitations under the License. 
*/ #ifdef PADDLE_MOBILE_CL #include "pass/memory_optimize_cl.h" #include +#include #include "framework/cl/cl_image.h" #include "framework/lod_tensor.h" namespace paddle_mobile { @@ -79,7 +80,7 @@ void MemoryOptPassCl::operator()( std::vector fetch_var_nodes; for (const auto &op : block->Ops()) { - DLOG << "op_desc->Type(): " << op->Type(); + LOG(kNO_LOG) << "op_desc->Type(): " << op->Type(); for (const auto &outputs : op->GetOutputs()) { for (const auto &output : outputs.second) { // not a persistable and not a exclude one ,then add it to @@ -87,7 +88,7 @@ void MemoryOptPassCl::operator()( if (!IsPersistable(output) && std::find(exclude_var_names.begin(), exclude_var_names.end(), output) == exclude_var_names.end()) { - DLOG << "output: " << output; + LOG(kNO_LOG) << "output: " << output; ClVarNode *node = CreateNode(output); analysis_nodes_.push(node); } @@ -100,7 +101,7 @@ void MemoryOptPassCl::operator()( if (!IsPersistable(input) && std::find(exclude_var_names.begin(), exclude_var_names.end(), input) == exclude_var_names.end()) { - DLOG << "input: " << input; + LOG(kNO_LOG) << "input: " << input; ClVarNode *node = CreateNode(input); analysis_nodes_.push(node); if (op->Type() == "fetch") { @@ -114,7 +115,7 @@ void MemoryOptPassCl::operator()( if (!IsPersistable(output) && std::find(exclude_var_names.begin(), exclude_var_names.end(), output) == exclude_var_names.end()) { - DLOG << "output: " << output; + LOG(kNO_LOG) << "output: " << output; ClVarNode *node = CreateNode(output); analysis_nodes_.push(node); } @@ -164,8 +165,8 @@ void MemoryOptPassCl::ShareData( cl_command_queue command_queue = scope->GetCLScpoe()->CommandQueue(); for (const auto &list : reused_nodes_) { - DLOG << "\n"; - DLOG << "gpu . share memory within these variables"; + LOG(kNO_LOG) << "\n"; + LOG(kNO_LOG) << "gpu . share memory within these variables"; int64_t x_based_max_numl = -1; int64_t y_based_max_numl = -1; int64_t x_based_max_x = -1; diff --git a/mobile/test/CMakeLists.txt b/mobile/test/CMakeLists.txt index 76ddd78f1a..078440f45b 100644 --- a/mobile/test/CMakeLists.txt +++ b/mobile/test/CMakeLists.txt @@ -551,6 +551,12 @@ if (ENABLE_ALL_TEST) ADD_EXECUTABLE(test-inference-api-v2 net/test_inference_api_v2.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-inference-api-v2 paddle-mobile) + + if (GPU_CL) + ADD_EXECUTABLE(test-net-male2fe net/test_mobilenet_male2fe.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-net-male2fe paddle-mobile) + endif() + endif () else () # gen test diff --git a/mobile/test/executor_for_test_opencl.h b/mobile/test/executor_for_test_opencl.h old mode 100755 new mode 100644 diff --git a/mobile/test/net/test_inference_api_v2.cpp b/mobile/test/net/test_inference_api_v2.cpp old mode 100755 new mode 100644 diff --git a/mobile/test/net/test_mobilenet_male2fe.cpp b/mobile/test/net/test_mobilenet_male2fe.cpp new file mode 100644 index 0000000000..eb83b5bafe --- /dev/null +++ b/mobile/test/net/test_mobilenet_male2fe.cpp @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../../src/common/types.h" +#include "../test_helper.h" +#include "../test_include.h" + +void feed(PaddleMobile *paddle_mobile, const DDim &dims, + std::string image_path, std::string feed_name) { + float *input_data_array = new float[product(dims)]; + std::ifstream in(image_path, std::ios::in); + for (int i = 0; i < product(dims); i++) { + float num; + in >> num; + input_data_array[i] = num; + } + in.close(); + framework::Tensor input_tensor(input_data_array, dims); + DLOG << feed_name << " : " << input_tensor; + paddle_mobile->Feed(feed_name, input_tensor); +} + +int main() { + paddle_mobile::PaddleMobile paddle_mobile; + auto time1 = paddle_mobile::time(); +#ifdef PADDLE_MOBILE_CL + paddle_mobile.SetCLPath("/data/local/tmp/bin"); +#endif + + if (paddle_mobile.Load(std::string("../models/nanbiannv") + "/model", + std::string("../models/nanbiannv") + "/params", + true)) { + auto time2 = paddle_mobile::time(); + std::cout << "load cost :" << paddle_mobile::time_diff(time1, time2) << "ms" + << std::endl; + + std::vector input; + feed(&paddle_mobile, {1, 3, 256, 256}, "../images/input_1_3_256_256", + "image"); + + auto time3 = paddle_mobile::time(); + paddle_mobile.Predict(); + auto time4 = paddle_mobile::time(); + + std::cout << "predict cost :" << paddle_mobile::time_diff(time3, time4) + << "ms" << std::endl; + } + + auto rgb = paddle_mobile.Fetch("rgb"); + auto mask = paddle_mobile.Fetch("mask"); + LOG(kLOG_INFO) << "rgb" << *rgb; + LOG(kLOG_INFO) << "mask" << *mask; + return 0; +} diff --git a/mobile/test/net/test_net_multi_feed.cpp b/mobile/test/net/test_net_multi_feed.cpp old mode 100755 new mode 100644 diff --git a/mobile/test/operators/test_expend_op.cpp b/mobile/test/operators/test_expend_op.cpp old mode 100755 new mode 100644 diff --git a/mobile/tools/python/fluidtools/run_multi_feed.py b/mobile/tools/python/fluidtools/run_multi_feed.py old mode 100755 new mode 100644 -- GitLab